Merge branch 'for-linus' of git://git.kernel.dk/linux-block

author Linus Torvalds <torvalds@linux-foundation.org>

Sat, 6 Aug 2016 03:31:51 +0000 (23:31 -0400)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 6 Aug 2016 03:31:51 +0000 (23:31 -0400)
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 6 Aug 2016 03:31:51 +0000 (23:31 -0400)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 6 Aug 2016 03:31:51 +0000 (23:31 -0400)
diff --git a/Documentation/devicetree/bindings/input/rotary-encoder.txt b/Documentation/devicetree/bindings/input/rotary-encoder.txt

index 6c9f0c8a846c89a18532d56c488b7b133327113b..e85ce3dea4806bfbf9769d27514458603166e632 100644 (file)
--- a/Documentation/devicetree/bindings/input/rotary-encoder.txt
+++ b/Documentation/devicetree/bindings/input/rotary-encoder.txt
@@ -20,6 +20,8 @@ Optional properties:
    2: Half-period mode
    4: Quarter-period mode
  - wakeup-source: Boolean, rotary encoder can wake up the system.
+- rotary-encoder,encoding: String, the method used to encode steps.
+  Supported are "gray" (the default and more common) and "binary".
  
  Deprecated properties:
  - rotary-encoder,half-period: Makes the driver work on half-period mode.
@@ -34,6 +36,7 @@ Example:
                         compatible = "rotary-encoder";
                         gpios = <&gpio 19 1>, <&gpio 20 0>; /* GPIO19 is inverted */
                         linux,axis = <0>; /* REL_X */
+                       rotary-encoder,encoding = "gray";
                         rotary-encoder,relative-axis;
                 };
  
@@ -42,5 +45,6 @@ Example:
                         gpios = <&gpio 21 0>, <&gpio 22 0>;
                         linux,axis = <1>; /* ABS_Y */
                         rotary-encoder,steps = <24>;
+                       rotary-encoder,encoding = "binary";
                         rotary-encoder,rollover;
                 };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt b/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt

new file mode 100644 (file)

index 0000000..1112e0d
--- /dev/null
+++ b/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt
@@ -0,0 +1,36 @@
+* GSL 1680 touchscreen controller
+
+Required properties:
+- compatible             : "silead,gsl1680"
+- reg                    : I2C slave address of the chip (0x40)
+- interrupt-parent       : a phandle pointing to the interrupt controller
+                           serving the interrupt for this chip
+- interrupts             : interrupt specification for the gsl1680 interrupt
+- power-gpios            : Specification for the pin connected to the gsl1680's
+                           shutdown input. This needs to be driven high to take the
+                           gsl1680 out of its low power state
+- touchscreen-size-x     : See touchscreen.txt
+- touchscreen-size-y     : See touchscreen.txt
+
+Optional properties:
+- touchscreen-inverted-x  : See touchscreen.txt
+- touchscreen-inverted-y  : See touchscreen.txt
+- touchscreen-swapped-x-y : See touchscreen.txt
+- silead,max-fingers     : maximum number of fingers the touchscreen can detect
+
+Example:
+
+i2c@00000000 {
+       gsl1680: touchscreen@40 {
+               compatible = "silead,gsl1680";
+               reg = <0x40>;
+               interrupt-parent = <&pio>;
+               interrupts = <6 11 IRQ_TYPE_EDGE_FALLING>;
+               power-gpios = <&pio 1 3 GPIO_ACTIVE_HIGH>;
+               touchscreen-size-x = <480>;
+               touchscreen-size-y = <800>;
+               touchscreen-inverted-x;
+               touchscreen-swapped-x-y;
+               silead,max-fingers = <5>;
+       };
+};
diff --git a/Documentation/devicetree/bindings/input/touchscreen/sis_i2c.txt b/Documentation/devicetree/bindings/input/touchscreen/sis_i2c.txt

new file mode 100644 (file)

index 0000000..d87ad14
--- /dev/null
+++ b/Documentation/devicetree/bindings/input/touchscreen/sis_i2c.txt
@@ -0,0 +1,33 @@
+* SiS I2C Multiple Touch Controller
+
+Required properties:
+- compatible: must be "sis,9200-ts"
+- reg: i2c slave address
+- interrupt-parent: the phandle for the interrupt controller
+  (see interrupt binding [0])
+- interrupts: touch controller interrupt (see interrupt
+  binding [0])
+
+Optional properties:
+- pinctrl-names: should be "default" (see pinctrl binding [1]).
+- pinctrl-0: a phandle pointing to the pin settings for the
+  device (see pinctrl binding [1]).
+- attn-gpios: the gpio pin used as attention line
+- reset-gpios: the gpio pin used to reset the controller
+- wakeup-source: touchscreen can be used as a wakeup source
+
+[0]: Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
+[1]: Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
+
+Example:
+
+       sis9255@5c  {
+               compatible = "sis,9200-ts";
+               reg = <0x5c>;
+               pinctrl-names = "default";
+               pinctrl-0 = <&pinctrl_sis>;
+               interrupt-parent = <&gpio3>;
+               interrupts = <19 IRQ_TYPE_EDGE_FALLING>;
+               attn-gpios = <&gpio3 19 GPIO_ACTIVE_LOW>;
+               reset-gpios = <&gpio2 30 GPIO_ACTIVE_LOW>;
+       };
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt

index 68391a462c0a52c8b96d66edc55b863af8e7d974..1992aa97d45ac8b0a9cfaefbaf5de40780c601ad 100644 (file)
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -238,6 +238,7 @@ simtek
  sii    Seiko Instruments, Inc.
  silergy        Silergy Corp.
  sirf   SiRF Technology, Inc.
+sis    Silicon Integrated Systems Corp.
  sitronix       Sitronix Technology Corporation
  skyworks       Skyworks Solutions, Inc.
  smsc   Standard Microsystems Corporation
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt

index 1a855d0c11fa7c07d7e69f9983f478aa26c850e6..eb0a0582d912fdc4dc71f7bf27571a765d10bf15 100644 (file)
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3877,6 +3877,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                         using these two parameters to set the minimum and
                         maximum port values.
  
+       sunrpc.svc_rpc_per_connection_limit=
+                       [NFS,SUNRPC]
+                       Limit the number of requests that the server will
+                       process in parallel from a single connection.
+                       The default value is 0 (no limit).
+
         sunrpc.pool_mode=
                         [NFS]
                         Control how the NFS server code allocates CPUs to
diff --git a/MAINTAINERS b/MAINTAINERS

index bafc8043d4f0df1858f9220ebdab17e932bf4bbc..e9c75275405dabf20ccc9145448ab2d3c5413ac9 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3219,7 +3219,7 @@ M:        Johannes Weiner <hannes@cmpxchg.org>
  L:     cgroups@vger.kernel.org
  T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
  S:     Maintained
-F:     Documentation/cgroups/
+F:     Documentation/cgroup*
  F:     include/linux/cgroup*
  F:     kernel/cgroup*
  
@@ -3230,7 +3230,7 @@ W:        http://www.bullopensource.org/cpuset/
  W:     http://oss.sgi.com/projects/cpusets/
  T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
  S:     Maintained
-F:     Documentation/cgroups/cpusets.txt
+F:     Documentation/cgroup-v1/cpusets.txt
  F:     include/linux/cpuset.h
  F:     kernel/cpuset.c
  
@@ -5831,7 +5831,15 @@ M:       Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
  L:     linux-scsi@vger.kernel.org
  S:     Supported
  F:     drivers/scsi/ibmvscsi/ibmvscsi*
-F:     drivers/scsi/ibmvscsi/viosrp.h
+F:     include/scsi/viosrp.h
+
+IBM Power Virtual SCSI Device Target Driver
+M:     Bryant G. Ly <bryantly@linux.vnet.ibm.com>
+M:     Michael Cyr <mikecyr@linux.vnet.ibm.com>
+L:     linux-scsi@vger.kernel.org
+L:     target-devel@vger.kernel.org
+S:     Supported
+F:     drivers/scsi/ibmvscsi_tgt/
  
  IBM Power Virtual FC Device Drivers
  M:     Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
@@ -7639,6 +7647,15 @@ W:       http://www.mellanox.com
  Q:     http://patchwork.ozlabs.org/project/netdev/list/
  F:     drivers/net/ethernet/mellanox/mlxsw/
  
+SOFT-ROCE DRIVER (rxe)
+M:     Moni Shoua <monis@mellanox.com>
+L:     linux-rdma@vger.kernel.org
+S:     Supported
+W:     https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home
+Q:     http://patchwork.kernel.org/project/linux-rdma/list/
+F:     drivers/infiniband/hw/rxe/
+F:     include/uapi/rdma/rdma_user_rxe.h
+
  MEMBARRIER SUPPORT
  M:     Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  M:     "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
@@ -9811,10 +9828,14 @@ L:      rtc-linux@googlegroups.com
  Q:     http://patchwork.ozlabs.org/project/rtc-linux/list/
  T:     git git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git
  S:     Maintained
+F:     Documentation/devicetree/bindings/rtc/
  F:     Documentation/rtc.txt
  F:     drivers/rtc/
  F:     include/linux/rtc.h
  F:     include/uapi/linux/rtc.h
+F:     include/linux/rtc/
+F:     include/linux/platform_data/rtc-*
+F:     tools/testing/selftests/timers/rtctest.c
  
  REALTEK AUDIO CODECS
  M:     Bard Liao <bardliao@realtek.com>
diff --git a/arch/alpha/include/asm/rtc.h b/arch/alpha/include/asm/rtc.h

deleted file mode 100644 (file)

index f71c3b0..0000000
--- a/arch/alpha/include/asm/rtc.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/rtc.h>
diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c

index 53dd2f1a53aabd25f188053e2e78fb9510d45d75..d5f0580746a5d632452816b00427c212230b0810 100644 (file)
--- a/arch/alpha/kernel/core_marvel.c
+++ b/arch/alpha/kernel/core_marvel.c
@@ -24,7 +24,6 @@
  #include <asm/gct.h>
  #include <asm/pgalloc.h>
  #include <asm/tlbflush.h>
-#include <asm/rtc.h>
  #include <asm/vga.h>
  
  #include "proto.h"
diff --git a/arch/alpha/kernel/rtc.c b/arch/alpha/kernel/rtc.c

index f535a3fd0f60cc9651e89b83cb821b510374d087..ceed68c7500bab70a18d39c55dda9c87432b7867 100644 (file)
--- a/arch/alpha/kernel/rtc.c
+++ b/arch/alpha/kernel/rtc.c
@@ -15,8 +15,6 @@
  #include <linux/rtc.h>
  #include <linux/platform_device.h>
  
-#include <asm/rtc.h>
-
  #include "proto.h"
  
  
@@ -81,7 +79,7 @@ init_rtc_epoch(void)
  static int
  alpha_rtc_read_time(struct device *dev, struct rtc_time *tm)
  {
-       __get_rtc_time(tm);
+       mc146818_get_time(tm);
  
         /* Adjust for non-default epochs.  It's easier to depend on the
            generic __get_rtc_time and adjust the epoch here than create
@@ -112,7 +110,7 @@ alpha_rtc_set_time(struct device *dev, struct rtc_time *tm)
                 tm = &xtm;
         }
  
-       return __set_rtc_time(tm);
+       return mc146818_set_time(tm);
  }
  
  static int
diff --git a/arch/arm/mach-ep93xx/ts72xx.c b/arch/arm/mach-ep93xx/ts72xx.c

index 45b81a2bcd4b39ca121c5eedb04c6600c65ec65d..3b39ea353d3075b687a6f30a4688bab8994ab625 100644 (file)
--- a/arch/arm/mach-ep93xx/ts72xx.c
+++ b/arch/arm/mach-ep93xx/ts72xx.c
@@ -16,7 +16,7 @@
  #include <linux/init.h>
  #include <linux/platform_device.h>
  #include <linux/io.h>
-#include <linux/m48t86.h>
+#include <linux/platform_data/rtc-m48t86.h>
  #include <linux/mtd/nand.h>
  #include <linux/mtd/partitions.h>
  
diff --git a/arch/arm/mach-orion5x/ts78xx-setup.c b/arch/arm/mach-orion5x/ts78xx-setup.c

index 3a58a5d4a28a00d77df5b22ddf89a5282074ebc6..8d597267d0c457530ca097b0730f251fd2d4c7bd 100644 (file)
--- a/arch/arm/mach-orion5x/ts78xx-setup.c
+++ b/arch/arm/mach-orion5x/ts78xx-setup.c
@@ -16,7 +16,7 @@
  #include <linux/platform_device.h>
  #include <linux/mv643xx_eth.h>
  #include <linux/ata_platform.h>
-#include <linux/m48t86.h>
+#include <linux/platform_data/rtc-m48t86.h>
  #include <linux/mtd/nand.h>
  #include <linux/mtd/partitions.h>
  #include <linux/timeriomem-rng.h>
diff --git a/arch/arm/mach-pxa/cm-x270.c b/arch/arm/mach-pxa/cm-x270.c

index fa5f51d633a3edc01afc39aac8f134fac857df1b..be4a66166d6196b2838605db2857e8b5cd012f97 100644 (file)
--- a/arch/arm/mach-pxa/cm-x270.c
+++ b/arch/arm/mach-pxa/cm-x270.c
@@ -14,7 +14,7 @@
  #include <linux/gpio.h>
  #include <linux/delay.h>
  
-#include <linux/rtc-v3020.h>
+#include <linux/platform_data/rtc-v3020.h>
  #include <video/mbxfb.h>
  
  #include <linux/spi/spi.h>
diff --git a/arch/arm/mach-pxa/cm-x300.c b/arch/arm/mach-pxa/cm-x300.c

index 5f5ac7c8faf049e44fef4c4671b4f135c25506e0..868448d2cd8251620af75775dabfe8eb26e60c5d 100644 (file)
--- a/arch/arm/mach-pxa/cm-x300.c
+++ b/arch/arm/mach-pxa/cm-x300.c
@@ -25,7 +25,7 @@
  #include <linux/gpio.h>
  #include <linux/dm9000.h>
  #include <linux/leds.h>
-#include <linux/rtc-v3020.h>
+#include <linux/platform_data/rtc-v3020.h>
  #include <linux/pwm.h>
  #include <linux/pwm_backlight.h>
  
diff --git a/arch/arm/mach-pxa/em-x270.c b/arch/arm/mach-pxa/em-x270.c

index 6e0268deec43111c820e406897c5a56bad857dcf..03354c21e1f2c3d49b2cedc3d6ff8599600a61fa 100644 (file)
--- a/arch/arm/mach-pxa/em-x270.c
+++ b/arch/arm/mach-pxa/em-x270.c
@@ -14,7 +14,7 @@
  #include <linux/delay.h>
  
  #include <linux/dm9000.h>
-#include <linux/rtc-v3020.h>
+#include <linux/platform_data/rtc-v3020.h>
  #include <linux/mtd/nand.h>
  #include <linux/mtd/partitions.h>
  #include <linux/mtd/physmap.h>
diff --git a/arch/frv/include/asm/mc146818rtc.h b/arch/frv/include/asm/mc146818rtc.h

deleted file mode 100644 (file)

index 90dfb7a..0000000
--- a/arch/frv/include/asm/mc146818rtc.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* mc146818rtc.h: RTC defs
- *
- * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _ASM_MC146818RTC_H
-#define _ASM_MC146818RTC_H
-
-
-#endif /* _ASM_MC146818RTC_H */
diff --git a/arch/h8300/include/asm/mc146818rtc.h b/arch/h8300/include/asm/mc146818rtc.h

deleted file mode 100644 (file)

index ab9d964..0000000
--- a/arch/h8300/include/asm/mc146818rtc.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * Machine dependent access functions for RTC registers.
- */
-#ifndef _H8300_MC146818RTC_H
-#define _H8300_MC146818RTC_H
-
-/* empty include file to satisfy the include in genrtc.c/ide-geometry.c */
-
-#endif /* _H8300_MC146818RTC_H */
diff --git a/arch/ia64/include/asm/mc146818rtc.h b/arch/ia64/include/asm/mc146818rtc.h

deleted file mode 100644 (file)

index 407787a..0000000
--- a/arch/ia64/include/asm/mc146818rtc.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _ASM_IA64_MC146818RTC_H
-#define _ASM_IA64_MC146818RTC_H
-
-/*
- * Machine dependent access functions for RTC registers.
- */
-
-/* empty include file to satisfy the include in genrtc.c */
-
-#endif /* _ASM_IA64_MC146818RTC_H */
diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c

index 01693df7f2f6ae261416083266991b91669f7aa9..ec9cc1fdd237dec4e776aab29bb36816646a30a7 100644 (file)
--- a/arch/m68k/amiga/config.c
+++ b/arch/m68k/amiga/config.c
@@ -35,7 +35,6 @@
  #include <asm/amigahw.h>
  #include <asm/amigaints.h>
  #include <asm/irq.h>
-#include <asm/rtc.h>
  #include <asm/machdep.h>
  #include <asm/io.h>
  
diff --git a/arch/m68k/apollo/config.c b/arch/m68k/apollo/config.c

index 6e62d66c396eecde7acaf50c5241011fbaa0983a..432bc8bacfc2b376dcfcb5a439a6f16e637ab503 100644 (file)
--- a/arch/m68k/apollo/config.c
+++ b/arch/m68k/apollo/config.c
@@ -15,7 +15,6 @@
  #include <asm/pgtable.h>
  #include <asm/apollohw.h>
  #include <asm/irq.h>
-#include <asm/rtc.h>
  #include <asm/machdep.h>
  
  u_long sio01_physaddr;
diff --git a/arch/m68k/bvme6000/config.c b/arch/m68k/bvme6000/config.c

index 478623dbb2092b357a1f355d382ad3cc272f8e95..611d4d9ea2bd80aca5cdbf8425fcdbe4c2e8f67b 100644 (file)
--- a/arch/m68k/bvme6000/config.c
+++ b/arch/m68k/bvme6000/config.c
@@ -34,7 +34,6 @@
  #include <asm/setup.h>
  #include <asm/irq.h>
  #include <asm/traps.h>
-#include <asm/rtc.h>
  #include <asm/machdep.h>
  #include <asm/bvme6000hw.h>
  
diff --git a/arch/m68k/hp300/config.c b/arch/m68k/hp300/config.c

index a9befe65adc42b16e49ee318812eecb8d9b3a3f3..7cfab158fb616218cadeab81f202a684f05d9565 100644 (file)
--- a/arch/m68k/hp300/config.c
+++ b/arch/m68k/hp300/config.c
@@ -12,6 +12,7 @@
  #include <linux/string.h>
  #include <linux/kernel.h>
  #include <linux/console.h>
+#include <linux/rtc.h>
  
  #include <asm/bootinfo.h>
  #include <asm/bootinfo-hp300.h>
@@ -20,7 +21,6 @@
  #include <asm/blinken.h>
  #include <asm/io.h>                               /* readb() and writeb() */
  #include <asm/hp300hw.h>
-#include <asm/rtc.h>
  
  #include "time.h"
  
diff --git a/arch/m68k/include/asm/flat.h b/arch/m68k/include/asm/flat.h

index f9454b89a51b9c763c6538688b1de1cc6fc01736..00c392b0cabdb8288c46b4b27008e4c364ed2ac4 100644 (file)
--- a/arch/m68k/include/asm/flat.h
+++ b/arch/m68k/include/asm/flat.h
@@ -1,5 +1,5 @@
  /*
- * include/asm-m68knommu/flat.h -- uClinux flat-format executables
+ * flat.h -- uClinux flat-format executables
   */
  
  #ifndef __M68KNOMMU_FLAT_H__
@@ -8,8 +8,9 @@
  #define        flat_argvp_envp_on_stack()              1
  #define        flat_old_ram_flag(flags)                (flags)
  #define        flat_reloc_valid(reloc, size)           ((reloc) <= (size))
-#define        flat_get_addr_from_rp(rp, relval, flags, p)     get_unaligned(rp)
-#define        flat_put_addr_at_rp(rp, val, relval)    put_unaligned(val,rp)
+#define        flat_get_addr_from_rp(rp, relval, flags, p) \
+       ({ unsigned long __val; __get_user_unaligned(__val, rp); __val; })
+#define        flat_put_addr_at_rp(rp, val, relval)    __put_user_unaligned(val, rp)
  #define        flat_get_relocate_addr(rel)             (rel)
  
  static inline int flat_set_persistent(unsigned long relval,
@@ -18,4 +19,10 @@ static inline int flat_set_persistent(unsigned long relval,
         return 0;
  }
  
+#define FLAT_PLAT_INIT(regs) \
+       do { \
+               if (current->mm) \
+                       (regs)->d5 = current->mm->start_data; \
+       } while (0)
+
  #endif /* __M68KNOMMU_FLAT_H__ */
diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h

index a6ce2ec8d693f1643c93fefc70d9934954b6a716..c84a2183b3f0abb6110a2c17840d90baac59eb29 100644 (file)
--- a/arch/m68k/include/asm/processor.h
+++ b/arch/m68k/include/asm/processor.h
@@ -110,7 +110,6 @@ struct thread_struct {
  #define setframeformat(_regs)  do { } while (0)
  #endif
  
-#ifdef CONFIG_MMU
  /*
   * Do necessary setup to start up a newly executed thread.
   */
@@ -123,26 +122,14 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc,
         wrusp(usp);
  }
  
+#ifdef CONFIG_MMU
  extern int handle_kernel_fault(struct pt_regs *regs);
-
  #else
-
-#define start_thread(_regs, _pc, _usp)                  \
-do {                                                    \
-       (_regs)->pc = (_pc);                            \
-       setframeformat(_regs);                          \
-       if (current->mm)                                \
-               (_regs)->d5 = current->mm->start_data;  \
-       (_regs)->sr &= ~0x2000;                         \
-       wrusp(_usp);                                    \
-} while(0)
-
  static inline  int handle_kernel_fault(struct pt_regs *regs)
  {
         /* Any fault in kernel is fatal on non-mmu */
         return 0;
  }
-
  #endif
  
  /* Forward declaration, a strange C thing */
diff --git a/arch/m68k/include/asm/rtc.h b/arch/m68k/include/asm/rtc.h

deleted file mode 100644 (file)

index a4d08ea..0000000
--- a/arch/m68k/include/asm/rtc.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* include/asm-m68k/rtc.h
- *
- * Copyright Richard Zidlicky
- * implementation details for genrtc/q40rtc driver
- */
-/* permission is hereby granted to copy, modify and redistribute this code
- * in terms of the GNU Library General Public License, Version 2 or later,
- * at your option.
- */
-
-#ifndef _ASM_RTC_H
-#define _ASM_RTC_H
-
-#ifdef __KERNEL__
-
-#include <linux/rtc.h>
-#include <asm/errno.h>
-#include <asm/machdep.h>
-
-#define RTC_PIE 0x40           /* periodic interrupt enable */
-#define RTC_AIE 0x20           /* alarm interrupt enable */
-#define RTC_UIE 0x10           /* update-finished interrupt enable */
-
-/* some dummy definitions */
-#define RTC_BATT_BAD 0x100     /* battery bad */
-#define RTC_SQWE 0x08          /* enable square-wave output */
-#define RTC_DM_BINARY 0x04     /* all time/date values are BCD if clear */
-#define RTC_24H 0x02           /* 24 hour mode - else hours bit 7 means pm */
-#define RTC_DST_EN 0x01                /* auto switch DST - works f. USA only */
-
-static inline unsigned int get_rtc_time(struct rtc_time *time)
-{
-       /*
-        * Only the values that we read from the RTC are set. We leave
-        * tm_wday, tm_yday and tm_isdst untouched. Even though the
-        * RTC has RTC_DAY_OF_WEEK, we ignore it, as it is only updated
-        * by the RTC when initially set to a non-zero value.
-        */
-       if (mach_hwclk)
-               mach_hwclk(0, time);
-       return RTC_24H;
-}
-
-static inline int set_rtc_time(struct rtc_time *time)
-{
-       if (mach_hwclk)
-               return mach_hwclk(1, time);
-       return -EINVAL;
-}
-
-static inline unsigned int get_rtc_ss(void)
-{
-       if (mach_get_ss)
-               return mach_get_ss();
-       else{
-               struct rtc_time h;
-
-               get_rtc_time(&h);
-               return h.tm_sec;
-       }
-}
-
-static inline int get_rtc_pll(struct rtc_pll_info *pll)
-{
-       if (mach_get_rtc_pll)
-               return mach_get_rtc_pll(pll);
-       else
-               return -EINVAL;
-}
-static inline int set_rtc_pll(struct rtc_pll_info *pll)
-{
-       if (mach_set_rtc_pll)
-               return mach_set_rtc_pll(pll);
-       else
-               return -EINVAL;
-}
-#endif /* __KERNEL__ */
-
-#endif /* _ASM__RTC_H */
diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c

index 3857737e3958fd3b4735cafd80f804c3be571ebb..4e5aa2f4f5225446fa7274d8c931a508cbb2fe0b 100644 (file)
--- a/arch/m68k/kernel/time.c
+++ b/arch/m68k/kernel/time.c
@@ -86,7 +86,49 @@ void read_persistent_clock(struct timespec *ts)
         }
  }
  
-#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
+#if defined(CONFIG_ARCH_USES_GETTIMEOFFSET) && IS_ENABLED(CONFIG_RTC_DRV_GENERIC)
+static int rtc_generic_get_time(struct device *dev, struct rtc_time *tm)
+{
+       mach_hwclk(0, tm);
+       return rtc_valid_tm(tm);
+}
+
+static int rtc_generic_set_time(struct device *dev, struct rtc_time *tm)
+{
+       if (mach_hwclk(1, tm) < 0)
+               return -EOPNOTSUPP;
+       return 0;
+}
+
+static int rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
+{
+       struct rtc_pll_info pll;
+       struct rtc_pll_info __user *argp = (void __user *)arg;
+
+       switch (cmd) {
+       case RTC_PLL_GET:
+               if (!mach_get_rtc_pll || mach_get_rtc_pll(&pll))
+                       return -EINVAL;
+               return copy_to_user(argp, &pll, sizeof pll) ? -EFAULT : 0;
+
+       case RTC_PLL_SET:
+               if (!mach_set_rtc_pll)
+                       return -EINVAL;
+               if (!capable(CAP_SYS_TIME))
+                       return -EACCES;
+               if (copy_from_user(&pll, argp, sizeof(pll)))
+                       return -EFAULT;
+               return mach_set_rtc_pll(&pll);
+       }
+
+       return -ENOIOCTLCMD;
+}
+
+static const struct rtc_class_ops generic_rtc_ops = {
+       .ioctl = rtc_ioctl,
+       .read_time = rtc_generic_get_time,
+       .set_time = rtc_generic_set_time,
+};
  
  static int __init rtc_init(void)
  {
@@ -95,7 +137,9 @@ static int __init rtc_init(void)
         if (!mach_hwclk)
                 return -ENODEV;
  
-       pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
+       pdev = platform_device_register_data(NULL, "rtc-generic", -1,
+                                            &generic_rtc_ops,
+                                            sizeof(generic_rtc_ops));
         return PTR_ERR_OR_ZERO(pdev);
  }
  
diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c

index 689b47d292acd4d5f44472a2efed3d2d1e94cc0e..2f33a33001e56a0ae048fc791d012db7d18d9b7c 100644 (file)
--- a/arch/m68k/mac/config.c
+++ b/arch/m68k/mac/config.c
@@ -10,6 +10,7 @@
   * Miscellaneous linux stuff
   */
  
+#include <linux/errno.h>
  #include <linux/module.h>
  #include <linux/types.h>
  #include <linux/mm.h>
@@ -25,6 +26,7 @@
  #include <linux/platform_device.h>
  #include <linux/adb.h>
  #include <linux/cuda.h>
+#include <linux/rtc.h>
  
  #include <asm/setup.h>
  #include <asm/bootinfo.h>
@@ -34,7 +36,6 @@
  #include <asm/io.h>
  #include <asm/irq.h>
  #include <asm/pgtable.h>
-#include <asm/rtc.h>
  #include <asm/machdep.h>
  
  #include <asm/macintosh.h>
diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c

index 707b61aea2030c8f7de9058a2546a8eb85ce1639..0fb54a90eac273054ccb7dd618caf69c668e9fe4 100644 (file)
--- a/arch/m68k/mac/misc.c
+++ b/arch/m68k/mac/misc.c
@@ -18,7 +18,6 @@
  
  #include <asm/uaccess.h>
  #include <asm/io.h>
-#include <asm/rtc.h>
  #include <asm/segment.h>
  #include <asm/setup.h>
  #include <asm/macintosh.h>
diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c

index e6a3b56c6481d14651557570fc63cbd8d00eefdb..c11d38dfad08faa12881276e0f41b1b7473d0dec 100644 (file)
--- a/arch/m68k/mvme147/config.c
+++ b/arch/m68k/mvme147/config.c
@@ -32,7 +32,6 @@
  #include <asm/setup.h>
  #include <asm/irq.h>
  #include <asm/traps.h>
-#include <asm/rtc.h>
  #include <asm/machdep.h>
  #include <asm/mvme147hw.h>
  
diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c

index a53803cc66cde8df435b70aac81b381c6bfe63a4..58e240939d265f69abd5c8c578587178f0f0f27d 100644 (file)
--- a/arch/m68k/mvme16x/config.c
+++ b/arch/m68k/mvme16x/config.c
@@ -35,7 +35,6 @@
  #include <asm/setup.h>
  #include <asm/irq.h>
  #include <asm/traps.h>
-#include <asm/rtc.h>
  #include <asm/machdep.h>
  #include <asm/mvme16xhw.h>
  
diff --git a/arch/m68k/q40/config.c b/arch/m68k/q40/config.c

index e90fe903613ead8ad480797d138f7d566f738f48..fcb7f05b60b60a7eb11d1222c8dac1cd93ac60dd 100644 (file)
--- a/arch/m68k/q40/config.c
+++ b/arch/m68k/q40/config.c
@@ -12,6 +12,7 @@
   * for more details.
   */
  
+#include <linux/errno.h>
  #include <linux/types.h>
  #include <linux/kernel.h>
  #include <linux/mm.h>
@@ -27,7 +28,6 @@
  #include <linux/platform_device.h>
  
  #include <asm/io.h>
-#include <asm/rtc.h>
  #include <asm/bootinfo.h>
  #include <asm/pgtable.h>
  #include <asm/setup.h>
diff --git a/arch/m68k/sun3/config.c b/arch/m68k/sun3/config.c

index 71884bf01d7200e6f77dc3d27731c1b6e97b9081..3af34fa3a344ba8509b6184acab3dc93a60b4cd4 100644 (file)
--- a/arch/m68k/sun3/config.c
+++ b/arch/m68k/sun3/config.c
@@ -26,7 +26,6 @@
  #include <asm/pgalloc.h>
  #include <asm/sun3-head.h>
  #include <asm/sun3mmu.h>
-#include <asm/rtc.h>
  #include <asm/machdep.h>
  #include <asm/machines.h>
  #include <asm/idprom.h>
diff --git a/arch/m68k/sun3/intersil.c b/arch/m68k/sun3/intersil.c

index 889829e11f1dbeba727a2ecff18580e416ee94ba..2cd0bcbe6f307909f8b83af0d00b51e0f6bb51a5 100644 (file)
--- a/arch/m68k/sun3/intersil.c
+++ b/arch/m68k/sun3/intersil.c
@@ -14,8 +14,8 @@
  #include <linux/rtc.h>
  
  #include <asm/errno.h>
-#include <asm/rtc.h>
  #include <asm/intersil.h>
+#include <asm/machdep.h>
  
  
  /* bits to set for start/run of the intersil */
diff --git a/arch/m68k/sun3x/time.c b/arch/m68k/sun3x/time.c

index c8eb08add6b0801cb03718d012657f21cdeded6f..431d3c4306dd8da594f485dbb297830dc7f0293d 100644 (file)
--- a/arch/m68k/sun3x/time.c
+++ b/arch/m68k/sun3x/time.c
@@ -15,10 +15,10 @@
  
  #include <asm/irq.h>
  #include <asm/io.h>
+#include <asm/machdep.h>
  #include <asm/traps.h>
  #include <asm/sun3x.h>
  #include <asm/sun3ints.h>
-#include <asm/rtc.h>
  
  #include "time.h"
  
diff --git a/arch/metag/include/asm/cmpxchg_lnkget.h b/arch/metag/include/asm/cmpxchg_lnkget.h

index 0154e2807ebb59b1c6720f140782fc247f4fea2e..2369ad39487607c26ebc5c1f2bb3c61765bd67c1 100644 (file)
--- a/arch/metag/include/asm/cmpxchg_lnkget.h
+++ b/arch/metag/include/asm/cmpxchg_lnkget.h
@@ -73,7 +73,7 @@ static inline unsigned long __cmpxchg_u32(volatile int *m, unsigned long old,
                       " DCACHE  [%2], %0\n"
  #endif
                       "2:\n"
-                     : "=&d" (temp), "=&da" (retval)
+                     : "=&d" (temp), "=&d" (retval)
                       : "da" (m), "bd" (old), "da" (new)
                       : "cc"
                       );
diff --git a/arch/metag/include/asm/metag_mem.h b/arch/metag/include/asm/metag_mem.h

index aa5a076df439a9490f941cd335c7af8c90b62b0c..7848bc6d3b61466ff48264dced902e4393a2dbec 100644 (file)
--- a/arch/metag/include/asm/metag_mem.h
+++ b/arch/metag/include/asm/metag_mem.h
@@ -881,7 +881,7 @@
  #define    PERFCTRL_DCSTALL 11 /* Dcache+TLB o/p delayed (per-thread) */
  #define    PERFCTRL_ICSTALL 12 /* Icache+TLB o/p delayed (per-thread) */
  
-#define    PERFCTRL_INT     13 /* Internal core delailed events (see next) */
+#define    PERFCTRL_INT     13 /* Internal core detailed events (see next) */
  #define    PERFCTRL_EXT     15 /* External source in core periphery */
  #endif /* METAC_2_1 */
  
diff --git a/arch/metag/include/asm/metag_regs.h b/arch/metag/include/asm/metag_regs.h

index 40c3f679c5b8640e60c69149f01567bf19256a26..60b750971d8a12c2a4b1d38463b01034817a4c75 100644 (file)
--- a/arch/metag/include/asm/metag_regs.h
+++ b/arch/metag/include/asm/metag_regs.h
@@ -179,7 +179,7 @@
  ;   is best to dump these registers immediately at the start of a routine
  ;   using a MSETL or SETL instruction-
  ;
-;   MSETL   [A0StP],D0Ar6,D0Ar4,D0Ar2; Only dump argments expected
+;   MSETL   [A0StP],D0Ar6,D0Ar4,D0Ar2; Only dump arguments expected
  ;or SETL    [A0StP+#8++],D0Ar2       ; Up to two 32-bit args expected
  ;
  ; For non-leaf routines it is always necessary to save and restore at least
diff --git a/arch/metag/kernel/cachepart.c b/arch/metag/kernel/cachepart.c

index 04b7d4f8429ac399ba74c34a39722311f052da3b..db944c2e7d889a1c32d55a750e78a2db6dce6905 100644 (file)
--- a/arch/metag/kernel/cachepart.c
+++ b/arch/metag/kernel/cachepart.c
@@ -15,7 +15,7 @@
  #define SYSC_DCPART(n) (SYSC_DCPART0 + SYSC_xCPARTn_STRIDE * (n))
  #define SYSC_ICPART(n) (SYSC_ICPART0 + SYSC_xCPARTn_STRIDE * (n))
  
-#define CACHE_ASSOCIATIVITY 4 /* 4 way set-assosiative */
+#define CACHE_ASSOCIATIVITY 4 /* 4 way set-associative */
  #define ICACHE 0
  #define DCACHE 1
  
diff --git a/arch/metag/lib/divsi3.S b/arch/metag/lib/divsi3.S

index 7c8a8ae9a0a109dc377bdfb6640f651c70013136..11124cc93dee8759bf8eabd51afd53e83418d105 100644 (file)
--- a/arch/metag/lib/divsi3.S
+++ b/arch/metag/lib/divsi3.S
@@ -50,7 +50,7 @@ $LIDMCQuick:
         ADDCC   D0Re0,D0Re0,#1          ! If yes result += 1
         SUBCC   D1Ar1,D1Ar1,D1Re0       !        and A -= Bu
         ORS     D0Ar4,D0Ar4,D0Ar4       ! Return neg result?
-       NEG     D0Ar2,D0Re0             ! Calulate neg result
+       NEG     D0Ar2,D0Re0             ! Calculate neg result
         MOVMI   D0Re0,D0Ar2             ! Yes: Take neg result
  $LIDMCRet:
         MOV     PC,D1RtP
@@ -94,7 +94,7 @@ $LIDMCLoop:
         LSR     D1Re0, D1Re0, #1        ! Shift down B
         BNZ     $LIDMCLoop               ! Was single bit in curbit lost?
         ORS     D0Ar4,D0Ar4,D0Ar4       ! Return neg result?
-       NEG     D0Ar2,D0Re0             ! Calulate neg result
+       NEG     D0Ar2,D0Re0             ! Calculate neg result
         MOVMI   D0Re0,D0Ar2             ! Yes: Take neg result
         MOV     PC,D1RtP
         .size   ___divsi3,.-___divsi3
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c

index 372783a67dda1a457e4dafc9139324133a24d564..c765b3621b9b9ff7ae6304dd63344e8640cc9667 100644 (file)
--- a/arch/metag/mm/fault.c
+++ b/arch/metag/mm/fault.c
@@ -187,7 +187,7 @@ bad_area_nosemaphore:
  
                 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
                     printk_ratelimit()) {
-                       pr_info("%s%s[%d]: segfault at %lx pc %08x sp %08x write %d trap %#x (%s)",
+                       printk("%s%s[%d]: segfault at %lx pc %08x sp %08x write %d trap %#x (%s)",
                                task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
                                tsk->comm, task_pid_nr(tsk), address,
                                regs->ctx.CurrPC, regs->ctx.AX[0].U0,
diff --git a/arch/mips/sgi-ip22/ip22-reset.c b/arch/mips/sgi-ip22/ip22-reset.c

index 063c2dd31e72856fdf83391ddff0721240df6f1b..2f45b035702148c03db45de8004e63e4a783483e 100644 (file)
--- a/arch/mips/sgi-ip22/ip22-reset.c
+++ b/arch/mips/sgi-ip22/ip22-reset.c
@@ -7,7 +7,7 @@
   */
  #include <linux/linkage.h>
  #include <linux/init.h>
-#include <linux/ds1286.h>
+#include <linux/rtc/ds1286.h>
  #include <linux/module.h>
  #include <linux/interrupt.h>
  #include <linux/kernel.h>
diff --git a/arch/mips/sni/time.c b/arch/mips/sni/time.c

index fb4b3520cdc615f507eb90a08e77ba6e06a26903..7ee14f41fc25dc20e0b5556c13e11df281b9ea76 100644 (file)
--- a/arch/mips/sni/time.c
+++ b/arch/mips/sni/time.c
@@ -8,7 +8,6 @@
  
  #include <asm/sni.h>
  #include <asm/time.h>
-#include <asm-generic/rtc.h>
  
  #define SNI_CLOCK_TICK_RATE    3686400
  #define SNI_COUNTER2_DIV       64
diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig

index 9627e81a6cbb0c4ad22000fc86922be19f44a265..38e3494bfb63ad0e620f6de3a93190f1e19944ca 100644 (file)
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -236,7 +236,9 @@ source "kernel/Kconfig.hz"
  config MN10300_RTC
         bool "Using MN10300 RTC"
         depends on MN10300_PROC_MN103E010 || MN10300_PROC_MN2WS0050
-       select GENERIC_CMOS_UPDATE
+       select RTC_CLASS
+       select RTC_DRV_CMOS
+       select RTC_SYSTOHC
         default n
         help
           This option enables support for the RTC, thus enabling time to be
diff --git a/arch/mn10300/include/asm/rtc-regs.h b/arch/mn10300/include/asm/rtc-regs.h

index c42deefaec118856539e1cbfb76d6cd08d25202a..c81cacecb6e3883d56fb366403bca001b380d632 100644 (file)
--- a/arch/mn10300/include/asm/rtc-regs.h
+++ b/arch/mn10300/include/asm/rtc-regs.h
@@ -75,9 +75,9 @@
  #define RTC_PORT(x)            0xd8600000
  #define RTC_ALWAYS_BCD         1       /* RTC operates in binary mode */
  
-#define CMOS_READ(addr)                __SYSREG(0xd8600000 + (addr), u8)
+#define CMOS_READ(addr)                __SYSREG(0xd8600000 + (u32)(addr), u8)
  #define CMOS_WRITE(val, addr)  \
-       do { __SYSREG(0xd8600000 + (addr), u8) = val; } while (0)
+       do { __SYSREG(0xd8600000 + (u32)(addr), u8) = val; } while (0)
  
  #define RTC_IRQ                        RTIRQ
  
diff --git a/arch/mn10300/include/asm/rtc.h b/arch/mn10300/include/asm/rtc.h

index 6c14bb1d0d9b8c20f45fff9279052eb43ab113e2..07dc87656197921eb6d1713589be7ba6bb006527 100644 (file)
--- a/arch/mn10300/include/asm/rtc.h
+++ b/arch/mn10300/include/asm/rtc.h
@@ -25,6 +25,4 @@ static inline void calibrate_clock(void)
  
  #endif /* !CONFIG_MN10300_RTC */
  
-#include <asm-generic/rtc.h>
-
  #endif /* _ASM_RTC_H */
diff --git a/arch/mn10300/kernel/rtc.c b/arch/mn10300/kernel/rtc.c

index 48d7058b3295c9c9a4d6623056dc22a74df28588..f81f370250720427d8d570e64ed7ae0469571a3a 100644 (file)
--- a/arch/mn10300/kernel/rtc.c
+++ b/arch/mn10300/kernel/rtc.c
@@ -12,107 +12,19 @@
  #include <linux/module.h>
  #include <linux/init.h>
  #include <linux/mc146818rtc.h>
-#include <linux/bcd.h>
-#include <linux/timex.h>
+#include <linux/ioport.h>
+#include <linux/platform_device.h>
+
  #include <asm/rtc-regs.h>
  #include <asm/rtc.h>
  
  DEFINE_SPINLOCK(rtc_lock);
  EXPORT_SYMBOL(rtc_lock);
  
-/*
- * Read the current RTC time
- */
-void read_persistent_clock(struct timespec *ts)
-{
-       struct rtc_time tm;
-
-       get_rtc_time(&tm);
-
-       ts->tv_nsec = 0;
-       ts->tv_sec = mktime(tm.tm_year, tm.tm_mon, tm.tm_mday,
-                           tm.tm_hour, tm.tm_min, tm.tm_sec);
-
-       /* if rtc is way off in the past, set something reasonable */
-       if (ts->tv_sec < 0)
-               ts->tv_sec = mktime(2009, 1, 1, 12, 0, 0);
-}
-
-/*
- * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
- * ms after the second nowtime has started, because when nowtime is written
- * into the registers of the CMOS clock, it will jump to the next second
- * precisely 500 ms later.  Check the Motorola MC146818A or Dallas DS12887 data
- * sheet for details.
- *
- * BUG: This routine does not handle hour overflow properly; it just
- *      sets the minutes. Usually you'll only notice that after reboot!
- */
-static int set_rtc_mmss(unsigned long nowtime)
-{
-       unsigned char save_control, save_freq_select;
-       int retval = 0;
-       int real_seconds, real_minutes, cmos_minutes;
-
-       /* gets recalled with irq locally disabled */
-       spin_lock(&rtc_lock);
-       save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being
-                                               * set */
-       CMOS_WRITE(save_control | RTC_SET, RTC_CONTROL);
-
-       save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset
-                                                       * prescaler */
-       CMOS_WRITE(save_freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
-
-       cmos_minutes = CMOS_READ(RTC_MINUTES);
-       if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-               cmos_minutes = bcd2bin(cmos_minutes);
-
-       /*
-        * since we're only adjusting minutes and seconds,
-        * don't interfere with hour overflow. This avoids
-        * messing with unknown time zones but requires your
-        * RTC not to be off by more than 15 minutes
-        */
-       real_seconds = nowtime % 60;
-       real_minutes = nowtime / 60;
-       if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
-               /* correct for half hour time zone */
-               real_minutes += 30;
-       real_minutes %= 60;
-
-       if (abs(real_minutes - cmos_minutes) < 30) {
-               if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-                       real_seconds = bin2bcd(real_seconds);
-                       real_minutes = bin2bcd(real_minutes);
-               }
-               CMOS_WRITE(real_seconds, RTC_SECONDS);
-               CMOS_WRITE(real_minutes, RTC_MINUTES);
-       } else {
-               printk_once(KERN_NOTICE
-                      "set_rtc_mmss: can't update from %d to %d\n",
-                      cmos_minutes, real_minutes);
-               retval = -1;
-       }
-
-       /* The following flags have to be released exactly in this order,
-        * otherwise the DS12887 (popular MC146818A clone with integrated
-        * battery and quartz) will not reset the oscillator and will not
-        * update precisely 500 ms later. You won't find this mentioned in
-        * the Dallas Semiconductor data sheets, but who believes data
-        * sheets anyway ...                           -- Markus Kuhn
-        */
-       CMOS_WRITE(save_control, RTC_CONTROL);
-       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
-       spin_unlock(&rtc_lock);
-
-       return retval;
-}
-
-int update_persistent_clock(struct timespec now)
-{
-       return set_rtc_mmss(now.tv_sec);
-}
+static const __initdata struct resource res[] = {
+       DEFINE_RES_IO(RTC_PORT(0), RTC_IO_EXTENT),
+       DEFINE_RES_IRQ(RTC_IRQ),
+};
  
  /*
   * calibrate the TSC clock against the RTC
@@ -129,4 +41,6 @@ void __init calibrate_clock(void)
         RTCRA |= RTCRA_DVR;
         RTCRA &= ~RTCRA_DVR;
         RTCRB &= ~RTCRB_SET;
+
+       platform_device_register_simple("rtc_cmos", -1, res, ARRAY_SIZE(res));
  }
diff --git a/arch/mn10300/proc-mn103e010/proc-init.c b/arch/mn10300/proc-mn103e010/proc-init.c

index 27b97980dca4d15eb05080ceedb92f873009eabd..102d86a6ae5636effea0288b436300fbc8fe1a1e 100644 (file)
--- a/arch/mn10300/proc-mn103e010/proc-init.c
+++ b/arch/mn10300/proc-mn103e010/proc-init.c
@@ -9,7 +9,10 @@
   * 2 of the Licence, or (at your option) any later version.
   */
  #include <linux/kernel.h>
+#include <linux/irq.h>
+#include <asm/cacheflush.h>
  #include <asm/fpu.h>
+#include <asm/irq.h>
  #include <asm/rtc.h>
  #include <asm/busctl-regs.h>
  
diff --git a/arch/mn10300/proc-mn2ws0050/proc-init.c b/arch/mn10300/proc-mn2ws0050/proc-init.c

index ee6d03dbc8d8d98d7ee61d9072b4c7986ee93bf2..950cc8dbb284f91398b0779f2a74af1e463efea9 100644 (file)
--- a/arch/mn10300/proc-mn2ws0050/proc-init.c
+++ b/arch/mn10300/proc-mn2ws0050/proc-init.c
@@ -14,6 +14,7 @@
  #include <linux/delay.h>
  #include <linux/interrupt.h>
  
+#include <asm/cacheflush.h>
  #include <asm/processor.h>
  #include <asm/uaccess.h>
  #include <asm/io.h>
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig

index dc117385ce2e17480f15771fdb96969f7e1971bf..cd87781031653e78693610f7539151537a3c6398 100644 (file)
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -31,6 +31,7 @@ config PARISC
         select TTY # Needed for pdc_cons.c
         select HAVE_DEBUG_STACKOVERFLOW
         select HAVE_ARCH_AUDITSYSCALL
+       select HAVE_ARCH_HASH
         select HAVE_ARCH_SECCOMP_FILTER
         select HAVE_ARCH_TRACEHOOK
         select HAVE_UNSTABLE_SCHED_CLOCK if (SMP || !64BIT)
diff --git a/arch/parisc/include/asm/hash.h b/arch/parisc/include/asm/hash.h

new file mode 100644 (file)

index 0000000..dbe9331
--- /dev/null
+++ b/arch/parisc/include/asm/hash.h
@@ -0,0 +1,146 @@
+#ifndef _ASM_HASH_H
+#define _ASM_HASH_H
+
+/*
+ * HP-PA only implements integer multiply in the FPU.  However, for
+ * integer multiplies by constant, it has a number of shift-and-add
+ * (but no shift-and-subtract, sigh!) instructions that a compiler
+ * can synthesize a code sequence with.
+ *
+ * Unfortunately, GCC isn't very efficient at using them.  For example
+ * it uses three instructions for "x *= 21" when only two are needed.
+ * But we can find a sequence manually.
+ */
+
+#define HAVE_ARCH__HASH_32 1
+
+/*
+ * This is a multiply by GOLDEN_RATIO_32 = 0x61C88647 optimized for the
+ * PA7100 pairing rules.  This is an in-order 2-way superscalar processor.
+ * Only one instruction in a pair may be a shift (by more than 3 bits),
+ * but other than that, simple ALU ops (including shift-and-add by up
+ * to 3 bits) may be paired arbitrarily.
+ *
+ * PA8xxx processors also dual-issue ALU instructions, although with
+ * fewer constraints, so this schedule is good for them, too.
+ *
+ * This 6-step sequence was found by Yevgen Voronenko's implementation
+ * of the Hcub algorithm at http://spiral.ece.cmu.edu/mcm/gen.html.
+ */
+static inline u32 __attribute_const__ __hash_32(u32 x)
+{
+       u32 a, b, c;
+
+       /*
+        * Phase 1: Compute  a = (x << 19) + x,
+        * b = (x << 9) + a, c = (x << 23) + b.
+        */
+       a = x << 19;            /* Two shifts can't be paired */
+       b = x << 9;     a += x;
+       c = x << 23;    b += a;
+                       c += b;
+       /* Phase 2: Return (b<<11) + (c<<6) + (a<<3) - c */
+       b <<= 11;
+       a += c << 3;    b -= c;
+       return (a << 3) + b;
+}
+
+#if BITS_PER_LONG == 64
+
+#define HAVE_ARCH_HASH_64 1
+
+/*
+ * Finding a good shift-and-add chain for GOLDEN_RATIO_64 is tricky,
+ * because available software for the purpose chokes on constants this
+ * large.  (It's mostly designed for compiling FIR filter coefficients
+ * into FPGAs.)
+ *
+ * However, Jason Thong pointed out a work-around.  The Hcub software
+ * (http://spiral.ece.cmu.edu/mcm/gen.html) is designed for *multiple*
+ * constant multiplication, and is good at finding shift-and-add chains
+ * which share common terms.
+ *
+ * Looking at 0x61C8864680B583EB in binary:
+ * 0110000111001000100001100100011010000000101101011000001111101011
+ *  \______________/    \__________/       \_______/     \________/
+ *   \____________________________/         \____________________/
+ * you can see the non-zero bits are divided into several well-separated
+ * blocks.  Hcub can find algorithms for those terms separately, which
+ * can then be shifted and added together.
+ *
+ * Dividing the input into 2, 3 or 4 blocks, Hcub can find solutions
+ * with 10, 9 or 8 adds, respectively, making a total of 11 for the
+ * whole number.
+ *
+ * Using just two large blocks, 0xC3910C8D << 31 in the high bits,
+ * and 0xB583EB in the low bits, produces as good an algorithm as any,
+ * and with one more small shift than alternatives.
+ *
+ * The high bits are a larger number and more work to compute, as well
+ * as needing one extra cycle to shift left 31 bits before the final
+ * addition, so they are the critical path for scheduling.  The low bits
+ * can fit into the scheduling slots left over.
+ */
+
+
+/*
+ * This _ASSIGN(dst, src) macro performs "dst = src", but prevents GCC
+ * from inferring anything about the value assigned to "dst".
+ *
+ * This prevents it from mis-optimizing certain sequences.
+ * In particular, gcc is annoyingly eager to combine consecutive shifts.
+ * Given "x <<= 19; y += x; z += x << 1;", GCC will turn this into
+ * "y += x << 19; z += x << 20;" even though the latter sequence needs
+ * an additional instruction and temporary register.
+ *
+ * Because no actual assembly code is generated, this construct is
+ * usefully portable across all GCC platforms, and so can be test-compiled
+ * on non-PA systems.
+ *
+ * In two places, additional unused input dependencies are added.  This
+ * forces GCC's scheduling so it does not rearrange instructions too much.
+ * Because the PA-8xxx is out of order, I'm not sure how much this matters,
+ * but why make it more difficult for the processor than necessary?
+ */
+#define _ASSIGN(dst, src, ...) asm("" : "=r" (dst) : "0" (src), ##__VA_ARGS__)
+
+/*
+ * Multiply by GOLDEN_RATIO_64 = 0x61C8864680B583EB using a heavily
+ * optimized shift-and-add sequence.
+ *
+ * Without the final shift, the multiply proper is 19 instructions,
+ * 10 cycles and uses only 4 temporaries.  Whew!
+ *
+ * You are not expected to understand this.
+ */
+static __always_inline u32 __attribute_const__
+hash_64(u64 a, unsigned int bits)
+{
+       u64 b, c, d;
+
+       /*
+        * Encourage GCC to move a dynamic shift to %sar early,
+        * thereby freeing up an additional temporary register.
+        */
+       if (!__builtin_constant_p(bits))
+               asm("" : "=q" (bits) : "0" (64 - bits));
+       else
+               bits = 64 - bits;
+
+       _ASSIGN(b, a*5);        c = a << 13;
+       b = (b << 2) + a;       _ASSIGN(d, a << 17);
+       a = b + (a << 1);       c += d;
+       d = a << 10;            _ASSIGN(a, a << 19);
+       d = a - d;              _ASSIGN(a, a << 4, "X" (d));
+       c += b;                 a += b;
+       d -= c;                 c += a << 1;
+       a += c << 3;            _ASSIGN(b, b << (7+31), "X" (c), "X" (d));
+       a <<= 31;               b += d;
+       a += b;
+       return a >> bits;
+}
+#undef _ASSIGN /* We're a widely-used header file, so don't litter! */
+
+#endif /* BITS_PER_LONG == 64 */
+
+#endif /* _ASM_HASH_H */
diff --git a/arch/parisc/include/asm/mc146818rtc.h b/arch/parisc/include/asm/mc146818rtc.h

deleted file mode 100644 (file)

index adf4163..0000000
--- a/arch/parisc/include/asm/mc146818rtc.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * Machine dependent access functions for RTC registers.
- */
-#ifndef _ASM_MC146818RTC_H
-#define _ASM_MC146818RTC_H
-
-/* empty include file to satisfy the include in genrtc.c */
-
-#endif /* _ASM_MC146818RTC_H */
diff --git a/arch/parisc/include/asm/rtc.h b/arch/parisc/include/asm/rtc.h

deleted file mode 100644 (file)

index 099d641..0000000
--- a/arch/parisc/include/asm/rtc.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/* 
- * include/asm-parisc/rtc.h
- *
- * Copyright 2002 Randolph CHung <tausq@debian.org>
- *
- * Based on: include/asm-ppc/rtc.h and the genrtc driver in the
- * 2.4 parisc linux tree
- */
-
-#ifndef __ASM_RTC_H__
-#define __ASM_RTC_H__
-
-#ifdef __KERNEL__
-
-#include <linux/rtc.h>
-
-#include <asm/pdc.h>
-
-#define SECS_PER_HOUR   (60 * 60)
-#define SECS_PER_DAY    (SECS_PER_HOUR * 24)
-
-
-#define RTC_PIE 0x40           /* periodic interrupt enable */
-#define RTC_AIE 0x20           /* alarm interrupt enable */
-#define RTC_UIE 0x10           /* update-finished interrupt enable */
-
-#define RTC_BATT_BAD 0x100     /* battery bad */
-
-/* some dummy definitions */
-#define RTC_SQWE 0x08          /* enable square-wave output */
-#define RTC_DM_BINARY 0x04     /* all time/date values are BCD if clear */
-#define RTC_24H 0x02           /* 24 hour mode - else hours bit 7 means pm */
-#define RTC_DST_EN 0x01                /* auto switch DST - works f. USA only */
-
-# define __isleap(year) \
-  ((year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0))
-
-/* How many days come before each month (0-12).  */
-static const unsigned short int __mon_yday[2][13] =
-{
-       /* Normal years.  */
-       { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 },
-       /* Leap years.  */
-       { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 }
-};
-
-static inline unsigned int get_rtc_time(struct rtc_time *wtime)
-{
-       struct pdc_tod tod_data;
-       long int days, rem, y;
-       const unsigned short int *ip;
-
-       memset(wtime, 0, sizeof(*wtime));
-       if (pdc_tod_read(&tod_data) < 0)
-               return RTC_24H | RTC_BATT_BAD;
-
-       // most of the remainder of this function is:
-//     Copyright (C) 1991, 1993, 1997, 1998 Free Software Foundation, Inc.
-//     This was originally a part of the GNU C Library.
-//      It is distributed under the GPL, and was swiped from offtime.c
-
-
-       days = tod_data.tod_sec / SECS_PER_DAY;
-       rem = tod_data.tod_sec % SECS_PER_DAY;
-
-       wtime->tm_hour = rem / SECS_PER_HOUR;
-       rem %= SECS_PER_HOUR;
-       wtime->tm_min = rem / 60;
-       wtime->tm_sec = rem % 60;
-
-       y = 1970;
-
-#define DIV(a, b) ((a) / (b) - ((a) % (b) < 0))
-#define LEAPS_THRU_END_OF(y) (DIV (y, 4) - DIV (y, 100) + DIV (y, 400))
-
-       while (days < 0 || days >= (__isleap (y) ? 366 : 365))
-       {
-               /* Guess a corrected year, assuming 365 days per year.  */
-               long int yg = y + days / 365 - (days % 365 < 0);
-
-               /* Adjust DAYS and Y to match the guessed year.  */
-               days -= ((yg - y) * 365
-                        + LEAPS_THRU_END_OF (yg - 1)
-                        - LEAPS_THRU_END_OF (y - 1));
-               y = yg;
-       }
-       wtime->tm_year = y - 1900;
-
-       ip = __mon_yday[__isleap(y)];
-       for (y = 11; days < (long int) ip[y]; --y)
-               continue;
-       days -= ip[y];
-       wtime->tm_mon = y;
-       wtime->tm_mday = days + 1;
-
-       return RTC_24H;
-}
-
-static int set_rtc_time(struct rtc_time *wtime)
-{
-       u_int32_t secs;
-
-       secs = mktime(wtime->tm_year + 1900, wtime->tm_mon + 1, wtime->tm_mday, 
-                     wtime->tm_hour, wtime->tm_min, wtime->tm_sec);
-
-       if(pdc_tod_set(secs, 0) < 0)
-               return -1;
-       else
-               return 0;
-
-}
-
-static inline unsigned int get_rtc_ss(void)
-{
-       struct rtc_time h;
-
-       get_rtc_time(&h);
-       return h.tm_sec;
-}
-
-static inline int get_rtc_pll(struct rtc_pll_info *pll)
-{
-       return -EINVAL;
-}
-static inline int set_rtc_pll(struct rtc_pll_info *pll)
-{
-       return -EINVAL;
-}
-
-#endif /* __KERNEL__ */
-#endif /* __ASM_RTC_H__ */
diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c

index 22395901d47bc8a541d4e8ee39ed7d4979031905..e5d71905cad567cc03e22ffdeb7f7295d635b12b 100644 (file)
--- a/arch/parisc/kernel/firmware.c
+++ b/arch/parisc/kernel/firmware.c
@@ -1354,9 +1354,9 @@ int pdc_pat_io_pci_cfg_read(unsigned long pci_addr, int pci_size, u32 *mem_addr)
         retval = mem_pdc_call(PDC_PAT_IO, PDC_PAT_IO_PCI_CONFIG_READ,
                                         __pa(pdc_result), pci_addr, pci_size);
         switch(pci_size) {
-               case 1: *(u8 *) mem_addr =  (u8)  pdc_result[0];
-               case 2: *(u16 *)mem_addr =  (u16) pdc_result[0];
-               case 4: *(u32 *)mem_addr =  (u32) pdc_result[0];
+               case 1: *(u8 *) mem_addr =  (u8)  pdc_result[0]; break;
+               case 2: *(u16 *)mem_addr =  (u16) pdc_result[0]; break;
+               case 4: *(u32 *)mem_addr =  (u32) pdc_result[0]; break;
         }
         spin_unlock_irqrestore(&pdc_lock, flags);
  
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c

index 31ec99a5f1196bbad6ce28aabcba8d111a63644f..505cf1ac5af24ecef4731f845fd6de516ae1f306 100644 (file)
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -12,6 +12,7 @@
   */
  #include <linux/errno.h>
  #include <linux/module.h>
+#include <linux/rtc.h>
  #include <linux/sched.h>
  #include <linux/kernel.h>
  #include <linux/param.h>
@@ -248,14 +249,47 @@ void __init start_cpu_itimer(void)
         per_cpu(cpu_data, cpu).it_value = next_tick;
  }
  
+#if IS_ENABLED(CONFIG_RTC_DRV_GENERIC)
+static int rtc_generic_get_time(struct device *dev, struct rtc_time *tm)
+{
+       struct pdc_tod tod_data;
+
+       memset(tm, 0, sizeof(*tm));
+       if (pdc_tod_read(&tod_data) < 0)
+               return -EOPNOTSUPP;
+
+       /* we treat tod_sec as unsigned, so this can work until year 2106 */
+       rtc_time64_to_tm(tod_data.tod_sec, tm);
+       return rtc_valid_tm(tm);
+}
+
+static int rtc_generic_set_time(struct device *dev, struct rtc_time *tm)
+{
+       time64_t secs = rtc_tm_to_time64(tm);
+
+       if (pdc_tod_set(secs, 0) < 0)
+               return -EOPNOTSUPP;
+
+       return 0;
+}
+
+static const struct rtc_class_ops rtc_generic_ops = {
+       .read_time = rtc_generic_get_time,
+       .set_time = rtc_generic_set_time,
+};
+
  static int __init rtc_init(void)
  {
         struct platform_device *pdev;
  
-       pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
+       pdev = platform_device_register_data(NULL, "rtc-generic", -1,
+                                            &rtc_generic_ops,
+                                            sizeof(rtc_generic_ops));
+
         return PTR_ERR_OR_ZERO(pdev);
  }
  device_initcall(rtc_init);
+#endif
  
  void read_persistent_clock(struct timespec *ts)
  {
diff --git a/arch/parisc/lib/iomap.c b/arch/parisc/lib/iomap.c

index fb8e10a4fb39d5184b127a934308df1a7aac7900..eaffbb90aa14dd7e3edb2ec68cbe8e5974ac91b7 100644 (file)
--- a/arch/parisc/lib/iomap.c
+++ b/arch/parisc/lib/iomap.c
@@ -125,22 +125,22 @@ static void ioport_write32r(void __iomem *addr, const void *s, unsigned long n)
  }
  
  static const struct iomap_ops ioport_ops = {
-       ioport_read8,
-       ioport_read16,
-       ioport_read16,
-       ioport_read32,
-       ioport_read32,
-       ioport_write8,
-       ioport_write16,
-       ioport_write16,
-       ioport_write32,
-       ioport_write32,
-       ioport_read8r,
-       ioport_read16r,
-       ioport_read32r,
-       ioport_write8r,
-       ioport_write16r,
-       ioport_write32r,
+       .read8 = ioport_read8,
+       .read16 = ioport_read16,
+       .read16be = ioport_read16,
+       .read32 = ioport_read32,
+       .read32be = ioport_read32,
+       .write8 = ioport_write8,
+       .write16 = ioport_write16,
+       .write16be = ioport_write16,
+       .write32 = ioport_write32,
+       .write32be = ioport_write32,
+       .read8r = ioport_read8r,
+       .read16r = ioport_read16r,
+       .read32r = ioport_read32r,
+       .write8r = ioport_write8r,
+       .write16r = ioport_write16r,
+       .write32r = ioport_write32r,
  };
  
  /* Legacy I/O memory ops */
@@ -244,22 +244,22 @@ static void iomem_write32r(void __iomem *addr, const void *s, unsigned long n)
  }
  
  static const struct iomap_ops iomem_ops = {
-       iomem_read8,
-       iomem_read16,
-       iomem_read16be,
-       iomem_read32,
-       iomem_read32be,
-       iomem_write8,
-       iomem_write16,
-       iomem_write16be,
-       iomem_write32,
-       iomem_write32be,
-       iomem_read8r,
-       iomem_read16r,
-       iomem_read32r,
-       iomem_write8r,
-       iomem_write16r,
-       iomem_write32r,
+       .read8 = iomem_read8,
+       .read16 = iomem_read16,
+       .read16be = iomem_read16be,
+       .read32 = iomem_read32,
+       .read32be = iomem_read32be,
+       .write8 = iomem_write8,
+       .write16 = iomem_write16,
+       .write16be = iomem_write16be,
+       .write32 = iomem_write32,
+       .write32be = iomem_write32be,
+       .read8r = iomem_read8r,
+       .read16r = iomem_read16r,
+       .read32r = iomem_read32r,
+       .write8r = iomem_write8r,
+       .write16r = iomem_write16r,
+       .write32r = iomem_write32r,
  };
  
  static const struct iomap_ops *iomap_ops[8] = {
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug

index 171047822b561dccb1a72e60c8bc957d43c1588e..63292f64b25a384f716ae7aaf08f90d927765ca5 100644 (file)
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -60,6 +60,25 @@ config CODE_PATCHING_SELFTEST
         depends on DEBUG_KERNEL
         default n
  
+config JUMP_LABEL_FEATURE_CHECKS
+       bool "Enable use of jump label for cpu/mmu_has_feature()"
+       depends on JUMP_LABEL
+       default y
+       help
+         Selecting this option enables use of jump labels for some internal
+         feature checks. This should generate more optimal code for those
+         checks.
+
+config JUMP_LABEL_FEATURE_CHECK_DEBUG
+       bool "Do extra check on feature fixup calls"
+       depends on DEBUG_KERNEL && JUMP_LABEL_FEATURE_CHECKS
+       default n
+       help
+         This tries to catch incorrect usage of cpu_has_feature() and
+         mmu_has_feature() in the code.
+
+         If you don't know what this means, say N.
+
  config FTR_FIXUP_SELFTEST
         bool "Run self-tests of the feature-fixup code"
         depends on DEBUG_KERNEL
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h

index 60f47649306fe8297af8d40860dabd1260ef0e25..c45189aa7476f844fc2400901e630c93d53c68c9 100644 (file)
--- a/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h
@@ -11,4 +11,19 @@ extern unsigned long
  radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                 unsigned long len, unsigned long pgoff,
                                 unsigned long flags);
+
+static inline int hstate_get_psize(struct hstate *hstate)
+{
+       unsigned long shift;
+
+       shift = huge_page_shift(hstate);
+       if (shift == mmu_psize_defs[MMU_PAGE_2M].shift)
+               return MMU_PAGE_2M;
+       else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift)
+               return MMU_PAGE_1G;
+       else {
+               WARN(1, "Wrong huge page shift\n");
+               return mmu_virtual_psize;
+       }
+}
  #endif
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h

index 5eaf86ac143d943d37e3f58ef725b4dbcccb1d02..287a656ceb5794f728c495dd4e1b841fdc47c8d5 100644 (file)
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -24,6 +24,7 @@
  #include <asm/book3s/64/pgtable.h>
  #include <asm/bug.h>
  #include <asm/processor.h>
+#include <asm/cpu_has_feature.h>
  
  /*
   * SLB
@@ -190,6 +191,15 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
         BUG();
  }
  
+static inline unsigned long get_sllp_encoding(int psize)
+{
+       unsigned long sllp;
+
+       sllp = ((mmu_psize_defs[psize].sllp & SLB_VSID_L) >> 6) |
+               ((mmu_psize_defs[psize].sllp & SLB_VSID_LP) >> 4);
+       return sllp;
+}
+
  #endif /* __ASSEMBLY__ */
  
  /*
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h

index d4eda6420523fe9ed5cd4d0bbb7a64ac19c45b7a..8afb0e00f7d9835baaeb7022102f46abdbe11a91 100644 (file)
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -23,13 +23,6 @@ struct mmu_psize_def {
  };
  extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
  
-#ifdef CONFIG_PPC_RADIX_MMU
-#define radix_enabled() mmu_has_feature(MMU_FTR_RADIX)
-#else
-#define radix_enabled() (0)
-#endif
-
-
  #endif /* __ASSEMBLY__ */
  
  /* 64-bit classic hash table MMU */
@@ -107,6 +100,9 @@ extern int mmu_vmemmap_psize;
  extern int mmu_io_psize;
  
  /* MMU initialization */
+void mmu_early_init_devtree(void);
+void hash__early_init_devtree(void);
+void radix__early_init_devtree(void);
  extern void radix_init_native(void);
  extern void hash__early_init_mmu(void);
  extern void radix__early_init_mmu(void);
@@ -132,11 +128,15 @@ extern void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
  static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
                                               phys_addr_t first_memblock_size)
  {
-       if (radix_enabled())
+       if (early_radix_enabled())
                 return radix__setup_initial_memory_limit(first_memblock_base,
                                                    first_memblock_size);
         return hash__setup_initial_memory_limit(first_memblock_base,
                                            first_memblock_size);
  }
+
+extern int (*register_process_table)(unsigned long base, unsigned long page_size,
+                                    unsigned long tbl_size);
+
  #endif /* __ASSEMBLY__ */
  #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h

index f12ddf5e8de51f91eb4c49ccb0283b0994bb4415..2f6373144e2c5eb43af53847d8bf6e87ca3c71c0 100644 (file)
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
@@ -75,11 +75,6 @@ static inline void hash__flush_tlb_page(struct vm_area_struct *vma,
  {
  }
  
-static inline void hash__flush_tlb_page_nohash(struct vm_area_struct *vma,
-                                          unsigned long vmaddr)
-{
-}
-
  static inline void hash__flush_tlb_range(struct vm_area_struct *vma,
                                      unsigned long start, unsigned long end)
  {
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h

index 00703e7e4c941468c49f9dcdd137d3d6f64971e9..65037762b1205d4aedf10dac62ba9388f15c5edc 100644 (file)
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -10,26 +10,32 @@ static inline int mmu_get_ap(int psize)
         return mmu_psize_defs[psize].ap;
  }
  
+extern void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma,
+                                          unsigned long start, unsigned long end);
+extern void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
+                                        unsigned long end, int psize);
+extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
+                                      unsigned long start, unsigned long end);
  extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
                             unsigned long end);
  extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
  
  extern void radix__local_flush_tlb_mm(struct mm_struct *mm);
  extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
-extern void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
-                                   unsigned long ap, int nid);
  extern void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr);
+extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
+                                             int psize);
  extern void radix__tlb_flush(struct mmu_gather *tlb);
  #ifdef CONFIG_SMP
  extern void radix__flush_tlb_mm(struct mm_struct *mm);
  extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
-extern void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
-                             unsigned long ap, int nid);
  extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr);
+extern void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
+                                       int psize);
  #else
  #define radix__flush_tlb_mm(mm)                radix__local_flush_tlb_mm(mm)
  #define radix__flush_tlb_page(vma,addr)        radix__local_flush_tlb_page(vma,addr)
-#define radix___flush_tlb_page(mm,addr,p,i)    radix___local_flush_tlb_page(mm,addr,p,i)
+#define radix__flush_tlb_page_psize(mm,addr,p) radix__local_flush_tlb_page_psize(mm,addr,p)
  #define radix__flush_tlb_pwc(tlb, addr)        radix__local_flush_tlb_pwc(tlb, addr)
  #endif
  extern void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa,
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h

index 96e5769b18b00fe8fbcd7d788c3c94457d15a98a..72b925f97bab7e9314823b221881a69b4279c7bd 100644 (file)
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -7,6 +7,25 @@
  #include <asm/book3s/64/tlbflush-hash.h>
  #include <asm/book3s/64/tlbflush-radix.h>
  
+#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
+static inline void flush_pmd_tlb_range(struct vm_area_struct *vma,
+                                      unsigned long start, unsigned long end)
+{
+       if (radix_enabled())
+               return radix__flush_pmd_tlb_range(vma, start, end);
+       return hash__flush_tlb_range(vma, start, end);
+}
+
+#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
+static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
+                                          unsigned long start,
+                                          unsigned long end)
+{
+       if (radix_enabled())
+               return radix__flush_hugetlb_tlb_range(vma, start, end);
+       return hash__flush_tlb_range(vma, start, end);
+}
+
  static inline void flush_tlb_range(struct vm_area_struct *vma,
                                    unsigned long start, unsigned long end)
  {
@@ -38,14 +57,6 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma,
         return hash__local_flush_tlb_page(vma, vmaddr);
  }
  
-static inline void flush_tlb_page_nohash(struct vm_area_struct *vma,
-                                        unsigned long vmaddr)
-{
-       if (radix_enabled())
-               return radix__flush_tlb_page(vma, vmaddr);
-       return hash__flush_tlb_page_nohash(vma, vmaddr);
-}
-
  static inline void tlb_flush(struct mmu_gather *tlb)
  {
         if (radix_enabled())
diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h

index 69fb16d7a8117a9509120ba81991075e19de104d..b77f0364df94059982c709403d9d5ba3affd0b9c 100644 (file)
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -11,6 +11,7 @@
  
  #include <linux/mm.h>
  #include <asm/cputable.h>
+#include <asm/cpu_has_feature.h>
  
  /*
   * No cache flushing is required when address mappings are changed,
diff --git a/arch/powerpc/include/asm/cpu_has_feature.h b/arch/powerpc/include/asm/cpu_has_feature.h

new file mode 100644 (file)

index 0000000..2ef55f8
--- /dev/null
+++ b/arch/powerpc/include/asm/cpu_has_feature.h
@@ -0,0 +1,53 @@
+#ifndef __ASM_POWERPC_CPUFEATURES_H
+#define __ASM_POWERPC_CPUFEATURES_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/bug.h>
+#include <asm/cputable.h>
+
+static inline bool early_cpu_has_feature(unsigned long feature)
+{
+       return !!((CPU_FTRS_ALWAYS & feature) ||
+                 (CPU_FTRS_POSSIBLE & cur_cpu_spec->cpu_features & feature));
+}
+
+#ifdef CONFIG_JUMP_LABEL_FEATURE_CHECKS
+#include <linux/jump_label.h>
+
+#define NUM_CPU_FTR_KEYS       64
+
+extern struct static_key_true cpu_feature_keys[NUM_CPU_FTR_KEYS];
+
+static __always_inline bool cpu_has_feature(unsigned long feature)
+{
+       int i;
+
+       BUILD_BUG_ON(!__builtin_constant_p(feature));
+
+#ifdef CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG
+       if (!static_key_initialized) {
+               printk("Warning! cpu_has_feature() used prior to jump label init!\n");
+               dump_stack();
+               return early_cpu_has_feature(feature);
+       }
+#endif
+
+       if (CPU_FTRS_ALWAYS & feature)
+               return true;
+
+       if (!(CPU_FTRS_POSSIBLE & feature))
+               return false;
+
+       i = __builtin_ctzl(feature);
+       return static_branch_likely(&cpu_feature_keys[i]);
+}
+#else
+static inline bool cpu_has_feature(unsigned long feature)
+{
+       return early_cpu_has_feature(feature);
+}
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* __ASM_POWERPC_CPUFEATURES_H */
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h

index df4fb5faba436e1922ef1bd774b0d17ed814b274..82026b4193413b8a823466bebe021c84657fdb2b 100644 (file)
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -2,6 +2,7 @@
  #define __ASM_POWERPC_CPUTABLE_H
  
  
+#include <linux/types.h>
  #include <asm/asm-compat.h>
  #include <asm/feature-fixups.h>
  #include <uapi/asm/cputable.h>
@@ -122,6 +123,12 @@ extern void do_feature_fixups(unsigned long value, void *fixup_start,
  
  extern const char *powerpc_base_platform;
  
+#ifdef CONFIG_JUMP_LABEL_FEATURE_CHECKS
+extern void cpu_feature_keys_init(void);
+#else
+static inline void cpu_feature_keys_init(void) { }
+#endif
+
  /* TLB flush actions. Used as argument to cpu_spec.flush_tlb() hook */
  enum {
         TLB_INVAL_SCOPE_GLOBAL = 0,     /* invalidate all TLBs */
@@ -576,14 +583,6 @@ enum {
  };
  #endif /* __powerpc64__ */
  
-static inline int cpu_has_feature(unsigned long feature)
-{
-       return (CPU_FTRS_ALWAYS & feature) ||
-              (CPU_FTRS_POSSIBLE
-               & cur_cpu_spec->cpu_features
-               & feature);
-}
-
  #define HBP_NUM 1
  
  #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h

index 2dfd4fc41f3e822496a1a118ac8d1f26b21b72b4..4f60db074725c7aed926fb9903b416d725abf51e 100644 (file)
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -28,6 +28,7 @@ static inline void setup_cputime_one_jiffy(void) { }
  #include <asm/div64.h>
  #include <asm/time.h>
  #include <asm/param.h>
+#include <asm/cpu_has_feature.h>
  
  typedef u64 __nocast cputime_t;
  typedef u64 __nocast cputime64_t;
diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h

index 5fa6b20eba10a5fd34939f0ee2313413ff285aa6..378167377065a01b3f98df459d82e75277b1d6fe 100644 (file)
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -16,6 +16,7 @@
  #include <linux/threads.h>
  
  #include <asm/ppc-opcode.h>
+#include <asm/cpu_has_feature.h>
  
  #define PPC_DBELL_MSG_BRDCAST  (0x04000000)
  #define PPC_DBELL_TYPE(x)      (((x) & 0xf) << (63-36))
diff --git a/arch/powerpc/include/asm/dcr-native.h b/arch/powerpc/include/asm/dcr-native.h

index 4efc11dacb9805013944790c6767d9c1d27c5616..4a2beef7427721eab0129a28696182047c659687 100644 (file)
--- a/arch/powerpc/include/asm/dcr-native.h
+++ b/arch/powerpc/include/asm/dcr-native.h
@@ -24,6 +24,7 @@
  
  #include <linux/spinlock.h>
  #include <asm/cputable.h>
+#include <asm/cpu_has_feature.h>
  
  typedef struct {
         unsigned int base;
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h

index e2d9f4996e5ca030257c008449128283be2bad33..c5517f463ec79c9ddd0bde0df2dd346ea51fe44c 100644 (file)
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -147,7 +147,7 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
  {
         pte_t pte;
         pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
-       flush_tlb_page(vma, addr);
+       flush_hugetlb_page(vma, addr);
  }
  
  static inline int huge_pte_none(pte_t pte)
diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h

index 9af103a2397592013e43adb4c48cd4738c3e6867..9a287e0ac8b16343e61ab9cc146957bfc4fb85b1 100644 (file)
--- a/arch/powerpc/include/asm/jump_label.h
+++ b/arch/powerpc/include/asm/jump_label.h
@@ -22,7 +22,7 @@
  static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
  {
         asm_volatile_goto("1:\n\t"
-                "nop\n\t"
+                "nop # arch_static_branch\n\t"
                  ".pushsection __jump_table,  \"aw\"\n\t"
                  JUMP_ENTRY_TYPE "1b, %l[l_yes], %c0\n\t"
                  ".popsection \n\t"
@@ -36,7 +36,7 @@ l_yes:
  static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
  {
         asm_volatile_goto("1:\n\t"
-                "b %l[l_yes]\n\t"
+                "b %l[l_yes] # arch_static_branch_jump\n\t"
                  ".pushsection __jump_table,  \"aw\"\n\t"
                  JUMP_ENTRY_TYPE "1b, %l[l_yes], %c0\n\t"
                  ".popsection \n\t"
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h

index 1f4497fb5b83adb5797e258509f2cf8edeaf21c4..88d17b4ea9c83177902a5c72a7ac344ba01c55d3 100644 (file)
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -181,8 +181,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
  
         switch (b_psize) {
         case MMU_PAGE_4K:
-               sllp = ((mmu_psize_defs[a_psize].sllp & SLB_VSID_L) >> 6) |
-                       ((mmu_psize_defs[a_psize].sllp & SLB_VSID_LP) >> 4);
+               sllp = get_sllp_encoding(a_psize);
                 rb |= sllp << 5;        /*  AP field */
                 rb |= (va_low & 0x7ff) << 12;   /* remaining 11 bits of AVA */
                 break;
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h

index 76f5398e7152e7e6a0e9d77c39e543d8f892c68a..0420b388dd8327d1ba2682a235342f67b22a3f2d 100644 (file)
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -219,8 +219,6 @@ struct machdep_calls {
  #ifdef CONFIG_ARCH_RANDOM
         int (*get_random_seed)(unsigned long *v);
  #endif
-       int (*register_process_table)(unsigned long base, unsigned long page_size,
-                                     unsigned long tbl_size);
  };
  
  extern void e500_idle(void);
diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h

index fc420cedecae59c6ee90579b67bf817bb2123491..30922f699341bc3fdf57f51eff9921b058140697 100644 (file)
--- a/arch/powerpc/include/asm/mman.h
+++ b/arch/powerpc/include/asm/mman.h
@@ -13,6 +13,7 @@
  
  #include <asm/cputable.h>
  #include <linux/mm.h>
+#include <asm/cpu_has_feature.h>
  
  /*
   * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits()
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h

index 54471228f7b8f5c3517b59ab9e7e46a54644442d..e2fb408f83983617591e3fe3511f9691b76f0b3d 100644 (file)
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -12,7 +12,7 @@
   */
  
  /*
- * First half is MMU families
+ * MMU families
   */
  #define MMU_FTR_HPTE_TABLE             ASM_CONST(0x00000001)
  #define MMU_FTR_TYPE_8xx               ASM_CONST(0x00000002)
@@ -21,9 +21,13 @@
  #define MMU_FTR_TYPE_FSL_E             ASM_CONST(0x00000010)
  #define MMU_FTR_TYPE_47x               ASM_CONST(0x00000020)
  
+/* Radix page table supported and enabled */
+#define MMU_FTR_TYPE_RADIX             ASM_CONST(0x00000040)
+
  /*
- * This is individual features
+ * Individual features below.
   */
+
  /*
   * We need to clear top 16bits of va (from the remaining 64 bits )in
   * tlbie* instructions
@@ -93,11 +97,6 @@
   */
  #define MMU_FTR_1T_SEGMENT             ASM_CONST(0x40000000)
  
-/*
- * Radix page table available
- */
-#define MMU_FTR_RADIX                  ASM_CONST(0x80000000)
-
  /* MMU feature bit sets for various CPUs */
  #define MMU_FTRS_DEFAULT_HPTE_ARCH_V2  \
         MMU_FTR_HPTE_TABLE | MMU_FTR_PPCAS_ARCH_V2
@@ -113,6 +112,7 @@
  #define MMU_FTRS_PA6T          MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
                                 MMU_FTR_CI_LARGE_PAGE | MMU_FTR_NO_SLBIE_B
  #ifndef __ASSEMBLY__
+#include <linux/bug.h>
  #include <asm/cputable.h>
  
  #ifdef CONFIG_PPC_FSL_BOOK3E
@@ -131,20 +131,71 @@ enum {
                 MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE |
                 MMU_FTR_1T_SEGMENT | MMU_FTR_TLBIE_CROP_VA |
  #ifdef CONFIG_PPC_RADIX_MMU
-               MMU_FTR_RADIX |
+               MMU_FTR_TYPE_RADIX |
  #endif
                 0,
  };
  
-static inline int mmu_has_feature(unsigned long feature)
+static inline bool early_mmu_has_feature(unsigned long feature)
  {
-       return (MMU_FTRS_POSSIBLE & cur_cpu_spec->mmu_features & feature);
+       return !!(MMU_FTRS_POSSIBLE & cur_cpu_spec->mmu_features & feature);
+}
+
+#ifdef CONFIG_JUMP_LABEL_FEATURE_CHECKS
+#include <linux/jump_label.h>
+
+#define NUM_MMU_FTR_KEYS       32
+
+extern struct static_key_true mmu_feature_keys[NUM_MMU_FTR_KEYS];
+
+extern void mmu_feature_keys_init(void);
+
+static __always_inline bool mmu_has_feature(unsigned long feature)
+{
+       int i;
+
+       BUILD_BUG_ON(!__builtin_constant_p(feature));
+
+#ifdef CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG
+       if (!static_key_initialized) {
+               printk("Warning! mmu_has_feature() used prior to jump label init!\n");
+               dump_stack();
+               return early_mmu_has_feature(feature);
+       }
+#endif
+
+       if (!(MMU_FTRS_POSSIBLE & feature))
+               return false;
+
+       i = __builtin_ctzl(feature);
+       return static_branch_likely(&mmu_feature_keys[i]);
  }
  
  static inline void mmu_clear_feature(unsigned long feature)
  {
+       int i;
+
+       i = __builtin_ctzl(feature);
         cur_cpu_spec->mmu_features &= ~feature;
+       static_branch_disable(&mmu_feature_keys[i]);
  }
+#else
+
+static inline void mmu_feature_keys_init(void)
+{
+
+}
+
+static inline bool mmu_has_feature(unsigned long feature)
+{
+       return early_mmu_has_feature(feature);
+}
+
+static inline void mmu_clear_feature(unsigned long feature)
+{
+       cur_cpu_spec->mmu_features &= ~feature;
+}
+#endif /* CONFIG_JUMP_LABEL_FEATURE_CHECKS */
  
  extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup;
  
@@ -164,6 +215,28 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
  }
  #endif /* !CONFIG_DEBUG_VM */
  
+#ifdef CONFIG_PPC_RADIX_MMU
+static inline bool radix_enabled(void)
+{
+       return mmu_has_feature(MMU_FTR_TYPE_RADIX);
+}
+
+static inline bool early_radix_enabled(void)
+{
+       return early_mmu_has_feature(MMU_FTR_TYPE_RADIX);
+}
+#else
+static inline bool radix_enabled(void)
+{
+       return false;
+}
+
+static inline bool early_radix_enabled(void)
+{
+       return false;
+}
+#endif
+
  #endif /* !__ASSEMBLY__ */
  
  /* The kernel use the constants below to index in the page sizes array.
@@ -210,6 +283,7 @@ extern void early_init_mmu(void);
  extern void early_init_mmu_secondary(void);
  extern void setup_initial_memory_limit(phys_addr_t first_memblock_base,
                                        phys_addr_t first_memblock_size);
+static inline void mmu_early_init_devtree(void) { }
  #endif /* __ASSEMBLY__ */
  #endif
  
@@ -230,9 +304,5 @@ extern void setup_initial_memory_limit(phys_addr_t first_memblock_base,
  #  include <asm/mmu-8xx.h>
  #endif
  
-#ifndef radix_enabled
-#define radix_enabled() (0)
-#endif
-
  #endif /* __KERNEL__ */
  #endif /* _ASM_POWERPC_MMU_H_ */
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h

index 40f3615bf94072ef3f65679166ab5cd58562c49c..f69f40f1519ad344d6faef75f39f73de1ad845bc 100644 (file)
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1256,15 +1256,6 @@ static inline void msr_check_and_clear(unsigned long bits)
                 __msr_check_and_clear(bits);
  }
  
-static inline unsigned long mfvtb (void)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-       if (cpu_has_feature(CPU_FTR_ARCH_207S))
-               return mfspr(SPRN_VTB);
-#endif
-       return 0;
-}
-
  #ifdef __powerpc64__
  #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E)
  #define mftb()         ({unsigned long rval;                           \
diff --git a/arch/powerpc/include/asm/rtc.h b/arch/powerpc/include/asm/rtc.h

deleted file mode 100644 (file)

index f580292..0000000
--- a/arch/powerpc/include/asm/rtc.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Real-time clock definitions and interfaces
- *
- * Author: Tom Rini <trini@mvista.com>
- *
- * 2002 (c) MontaVista, Software, Inc.  This file is licensed under
- * the terms of the GNU General Public License version 2.  This program
- * is licensed "as is" without any warranty of any kind, whether express
- * or implied.
- *
- * Based on:
- * include/asm-m68k/rtc.h
- *
- * Copyright Richard Zidlicky
- * implementation details for genrtc/q40rtc driver
- *
- * And the old drivers/macintosh/rtc.c which was heavily based on:
- * Linux/SPARC Real Time Clock Driver
- * Copyright (C) 1996 Thomas K. Dyas (tdyas@eden.rutgers.edu)
- *
- * With additional work by Paul Mackerras and Franz Sirl.
- */
-
-#ifndef __ASM_POWERPC_RTC_H__
-#define __ASM_POWERPC_RTC_H__
-
-#ifdef __KERNEL__
-
-#include <linux/rtc.h>
-
-#include <asm/machdep.h>
-#include <asm/time.h>
-
-#define RTC_PIE 0x40           /* periodic interrupt enable */
-#define RTC_AIE 0x20           /* alarm interrupt enable */
-#define RTC_UIE 0x10           /* update-finished interrupt enable */
-
-/* some dummy definitions */
-#define RTC_BATT_BAD 0x100     /* battery bad */
-#define RTC_SQWE 0x08          /* enable square-wave output */
-#define RTC_DM_BINARY 0x04     /* all time/date values are BCD if clear */
-#define RTC_24H 0x02           /* 24 hour mode - else hours bit 7 means pm */
-#define RTC_DST_EN 0x01                /* auto switch DST - works f. USA only */
-
-static inline unsigned int get_rtc_time(struct rtc_time *time)
-{
-       if (ppc_md.get_rtc_time)
-               ppc_md.get_rtc_time(time);
-       return RTC_24H;
-}
-
-/* Set the current date and time in the real time clock. */
-static inline int set_rtc_time(struct rtc_time *time)
-{
-       if (ppc_md.set_rtc_time)
-               return ppc_md.set_rtc_time(time);
-       return -EINVAL;
-}
-
-static inline unsigned int get_rtc_ss(void)
-{
-       struct rtc_time h;
-
-       get_rtc_time(&h);
-       return h.tm_sec;
-}
-
-static inline int get_rtc_pll(struct rtc_pll_info *pll)
-{
-       return -EINVAL;
-}
-static inline int set_rtc_pll(struct rtc_pll_info *pll)
-{
-       return -EINVAL;
-}
-
-#endif /* __KERNEL__ */
-#endif /* __ASM_POWERPC_RTC_H__ */
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h

index 17c8380673a60637c61fec5772162bf0ae5523cb..0a74ebe934e1cbcb61105b63d54a959614bca33d 100644 (file)
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -75,6 +75,14 @@ static inline void disable_kernel_spe(void)
  static inline void __giveup_spe(struct task_struct *t) { }
  #endif
  
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+extern void flush_tmregs_to_thread(struct task_struct *);
+#else
+static inline void flush_tmregs_to_thread(struct task_struct *t)
+{
+}
+#endif
+
  static inline void clear_task_ebb(struct task_struct *t)
  {
  #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h

index 09211640a0e0ab7009e29a1063745f2f93e29875..b240666b7bc1e9e11fc183183cbe37ed56de51e2 100644 (file)
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -18,6 +18,7 @@
  #include <linux/percpu.h>
  
  #include <asm/processor.h>
+#include <asm/cpu_has_feature.h>
  
  /* time.c */
  extern unsigned long tb_ticks_per_jiffy;
@@ -103,7 +104,7 @@ static inline u64 get_vtb(void)
  {
  #ifdef CONFIG_PPC_BOOK3S_64
         if (cpu_has_feature(CPU_FTR_ARCH_207S))
-               return mfvtb();
+               return mfspr(SPRN_VTB);
  #endif
         return 0;
  }
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h

index 20733fa518ae135995abfeb161b227d9a105adce..f6f68f73e8581147772bad3100f74ed5950987bd 100644 (file)
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -46,5 +46,18 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
  #endif
  }
  
+#ifdef CONFIG_SMP
+static inline int mm_is_core_local(struct mm_struct *mm)
+{
+       return cpumask_subset(mm_cpumask(mm),
+                             topology_sibling_cpumask(smp_processor_id()));
+}
+#else
+static inline int mm_is_core_local(struct mm_struct *mm)
+{
+       return 1;
+}
+#endif
+
  #endif /* __KERNEL__ */
  #endif /* __ASM_POWERPC_TLB_H */
diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h

index 1b38eea28e5aa488b8be93d9fda4d9767b81b846..13dbcd41885e12d068ecce25975ecbef5d067f0e 100644 (file)
--- a/arch/powerpc/include/asm/tlbflush.h
+++ b/arch/powerpc/include/asm/tlbflush.h
@@ -54,7 +54,6 @@ extern void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
  #define flush_tlb_page(vma,addr)       local_flush_tlb_page(vma,addr)
  #define __flush_tlb_page(mm,addr,p,i)  __local_flush_tlb_page(mm,addr,p,i)
  #endif
-#define flush_tlb_page_nohash(vma,addr)        flush_tlb_page(vma,addr)
  
  #elif defined(CONFIG_PPC_STD_MMU_32)
  
diff --git a/arch/powerpc/include/asm/xor.h b/arch/powerpc/include/asm/xor.h

index 0abb97f3be106700e224fbcc2c37d7ee783f3a72..a36c2069d8ed55d615d0687bcc3577d85f91a93c 100644 (file)
--- a/arch/powerpc/include/asm/xor.h
+++ b/arch/powerpc/include/asm/xor.h
@@ -23,6 +23,7 @@
  #ifdef CONFIG_ALTIVEC
  
  #include <asm/cputable.h>
+#include <asm/cpu_has_feature.h>
  
  void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
                    unsigned long *v2_in);
diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h

index c2d21d11c2d2c0c55554ce4faa153b88d006ea0f..3a9e44c45c7848cac2c8746a3dcac903eb59a682 100644 (file)
--- a/arch/powerpc/include/uapi/asm/elf.h
+++ b/arch/powerpc/include/uapi/asm/elf.h
@@ -91,6 +91,11 @@
  
  #define ELF_NGREG      48      /* includes nip, msr, lr, etc. */
  #define ELF_NFPREG     33      /* includes fpscr */
+#define ELF_NVMX       34      /* includes all vector registers */
+#define ELF_NVSX       32      /* includes all VSX registers */
+#define ELF_NTMSPRREG  3       /* includes tfhar, tfiar, texasr */
+#define ELF_NEBB       3       /* includes ebbrr, ebbhr, bescr */
+#define ELF_NPMU       5       /* includes siar, sdar, sier, mmcr2, mmcr0 */
  
  typedef unsigned long elf_greg_t64;
  typedef elf_greg_t64 elf_gregset_t64[ELF_NGREG];
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c

index c7097f933114c0fca94453b2ae6350217d50d4e7..033f3385fa49496d1d2de6837be081f33c3c357e 100644 (file)
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -26,6 +26,7 @@
  #include <asm/emulated_ops.h>
  #include <asm/switch_to.h>
  #include <asm/disassemble.h>
+#include <asm/cpu_has_feature.h>
  
  struct aligninfo {
         unsigned char len;
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c

index d81f826d102967a3db7ad3f3992ee00119233d55..74248ab18e9895e27d07924fca3c8f52f7733f90 100644 (file)
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -15,6 +15,7 @@
  #include <linux/threads.h>
  #include <linux/init.h>
  #include <linux/export.h>
+#include <linux/jump_label.h>
  
  #include <asm/oprofile_impl.h>
  #include <asm/cputable.h>
@@ -2224,3 +2225,39 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
  
         return NULL;
  }
+
+#ifdef CONFIG_JUMP_LABEL_FEATURE_CHECKS
+struct static_key_true cpu_feature_keys[NUM_CPU_FTR_KEYS] = {
+                       [0 ... NUM_CPU_FTR_KEYS - 1] = STATIC_KEY_TRUE_INIT
+};
+EXPORT_SYMBOL_GPL(cpu_feature_keys);
+
+void __init cpu_feature_keys_init(void)
+{
+       int i;
+
+       for (i = 0; i < NUM_CPU_FTR_KEYS; i++) {
+               unsigned long f = 1ul << i;
+
+               if (!(cur_cpu_spec->cpu_features & f))
+                       static_branch_disable(&cpu_feature_keys[i]);
+       }
+}
+
+struct static_key_true mmu_feature_keys[NUM_MMU_FTR_KEYS] = {
+                       [0 ... NUM_MMU_FTR_KEYS - 1] = STATIC_KEY_TRUE_INIT
+};
+EXPORT_SYMBOL_GPL(mmu_feature_keys);
+
+void __init mmu_feature_keys_init(void)
+{
+       int i;
+
+       for (i = 0; i < NUM_MMU_FTR_KEYS; i++) {
+               unsigned long f = 1ul << i;
+
+               if (!(cur_cpu_spec->mmu_features & f))
+                       static_branch_disable(&mmu_feature_keys[i]);
+       }
+}
+#endif
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S

index fcb2887f5a33da492b6700c877f9d5c2e888e4c4..6b8bc0dd09d4a6e769adae004867b839bb6f4c6f 100644 (file)
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -532,7 +532,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
  #ifdef CONFIG_PPC_STD_MMU_64
  BEGIN_MMU_FTR_SECTION
         b       2f
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
  BEGIN_FTR_SECTION
         clrrdi  r6,r8,28        /* get its ESID */
         clrrdi  r9,r1,28        /* get current sp ESID */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S

index 694def6c9d617818bf2a4cf4778a7648b0fb82f5..41091fdf9bd88fbe68d9ef15aeb04c1b59b18e77 100644 (file)
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -940,7 +940,7 @@ BEGIN_MMU_FTR_SECTION
         b       do_hash_page            /* Try to handle as hpte fault */
  MMU_FTR_SECTION_ELSE
         b       handle_page_fault
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX)
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
  
         .align  7
         .globl  h_data_storage_common
@@ -971,7 +971,7 @@ BEGIN_MMU_FTR_SECTION
         b       do_hash_page            /* Try to handle as hpte fault */
  MMU_FTR_SECTION_ELSE
         b       handle_page_fault
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX)
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
  
         STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception)
  
@@ -1392,7 +1392,7 @@ slb_miss_realmode:
  #ifdef CONFIG_PPC_STD_MMU_64
  BEGIN_MMU_FTR_SECTION
         bl      slb_allocate_realmode
-END_MMU_FTR_SECTION_IFCLR(MMU_FTR_RADIX)
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
  #endif
         /* All done -- return from exception. */
  
@@ -1406,7 +1406,7 @@ BEGIN_MMU_FTR_SECTION
         beq-    2f
  FTR_SECTION_ELSE
         b       2f
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX)
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
  
  .machine       push
  .machine       "power4"
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S

index 8a56a51fc0cbc08d5a011cfe4b91a503d0c41eb8..ba79d15f4ddd7c0d8ce946e15098d977a00338fa 100644 (file)
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -572,7 +572,7 @@ common_exit:
  
  BEGIN_MMU_FTR_SECTION
         b       no_segments
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
         /* Restore SLB  from PACA */
         ld      r8,PACA_SLBSHADOWPTR(r13)
  
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c

index ac910d9982df8f70fd12fe7f9c9c5619eb4adf77..08887cf2b20ee4a49d8ad88f4a6c00b1a722d974 100644 (file)
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -75,6 +75,7 @@
  #endif
  #define CREATE_TRACE_POINTS
  #include <asm/trace.h>
+#include <asm/cpu_has_feature.h>
  
  DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
  EXPORT_PER_CPU_SYMBOL(irq_stat);
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c

index 93dae296b6be693d635a56cf2b894c3dc20f8433..fa20060ff7a52e5cb55000d41c7b25b80baca53b 100644 (file)
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -184,7 +184,7 @@ void setup_paca(struct paca_struct *new_paca)
          * if we do a GET_PACA() before the feature fixups have been
          * applied
          */
-       if (cpu_has_feature(CPU_FTR_HVMODE))
+       if (early_cpu_has_feature(CPU_FTR_HVMODE))
                 mtspr(SPRN_SPRG_HPACA, local_paca);
  #endif
         mtspr(SPRN_SPRG_PACA, local_paca);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c

index a8cca88e972f2abb371d869cc26b5d2a11fcee70..58ccf86415b46cd5c2db593424ecde772cd0d959 100644 (file)
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -58,6 +58,7 @@
  #include <asm/code-patching.h>
  #include <asm/exec.h>
  #include <asm/livepatch.h>
+#include <asm/cpu_has_feature.h>
  
  #include <linux/kprobes.h>
  #include <linux/kdebug.h>
@@ -1073,6 +1074,26 @@ static inline void restore_sprs(struct thread_struct *old_thread,
  #endif
  }
  
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void flush_tmregs_to_thread(struct task_struct *tsk)
+{
+       /*
+        * Process self tracing is not yet supported through
+        * ptrace interface. Ptrace generic code should have
+        * prevented this from happening in the first place.
+        * Warn once here with the message, if somehow it
+        * is attempted.
+        */
+       WARN_ONCE(tsk == current,
+               "Not expecting ptrace on self: TM regs may be incorrect\n");
+
+       /*
+        * If task is not current, it should have been flushed
+        * already to its thread_struct during __switch_to().
+        */
+}
+#endif
+
  struct task_struct *__switch_to(struct task_struct *prev,
         struct task_struct *new)
  {
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c

index bae3db791150157b2374ab0f27bed0bc371494cd..b0245bed6f54862d187ef5e12757e872be7fca5c 100644 (file)
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -170,7 +170,7 @@ static struct ibm_pa_feature {
          */
         {CPU_FTR_TM_COMP, 0, 0,
          PPC_FEATURE2_HTM_COMP|PPC_FEATURE2_HTM_NOSC_COMP, 22, 0, 0},
-       {0, MMU_FTR_RADIX, 0, 0,                40, 0, 0},
+       {0, MMU_FTR_TYPE_RADIX, 0, 0,           40, 0, 0},
  };
  
  static void __init scan_features(unsigned long node, const unsigned char *ftrs,
@@ -647,14 +647,6 @@ static void __init early_reserve_mem(void)
  #endif
  }
  
-static bool disable_radix;
-static int __init parse_disable_radix(char *p)
-{
-       disable_radix = true;
-       return 0;
-}
-early_param("disable_radix", parse_disable_radix);
-
  void __init early_init_devtree(void *params)
  {
         phys_addr_t limit;
@@ -744,11 +736,8 @@ void __init early_init_devtree(void *params)
          */
         spinning_secondaries = boot_cpu_count - 1;
  #endif
-       /*
-        * now fixup radix MMU mode based on kernel command line
-        */
-       if (disable_radix)
-               cur_cpu_spec->mmu_features &= ~MMU_FTR_RADIX;
+
+       mmu_early_init_devtree();
  
  #ifdef CONFIG_PPC_POWERNV
         /* Scan and build the list of machine check recoverable ranges */
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c

index 134bee9ac664db9b36b3c0961bd36a1f2d72b720..4f3c5756cc09898f984de4cc6cf6fc7c1ba830ac 100644 (file)
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -64,6 +64,10 @@ struct pt_regs_offset {
         {.name = STR(gpr##num), .offset = offsetof(struct pt_regs, gpr[num])}
  #define REG_OFFSET_END {.name = NULL, .offset = 0}
  
+#define TVSO(f)        (offsetof(struct thread_vr_state, f))
+#define TFSO(f)        (offsetof(struct thread_fp_state, f))
+#define TSO(f) (offsetof(struct thread_struct, f))
+
  static const struct pt_regs_offset regoffset_table[] = {
         GPR_OFFSET_NAME(0),
         GPR_OFFSET_NAME(1),
@@ -181,6 +185,26 @@ static int set_user_msr(struct task_struct *task, unsigned long msr)
         return 0;
  }
  
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static unsigned long get_user_ckpt_msr(struct task_struct *task)
+{
+       return task->thread.ckpt_regs.msr | task->thread.fpexc_mode;
+}
+
+static int set_user_ckpt_msr(struct task_struct *task, unsigned long msr)
+{
+       task->thread.ckpt_regs.msr &= ~MSR_DEBUGCHANGE;
+       task->thread.ckpt_regs.msr |= msr & MSR_DEBUGCHANGE;
+       return 0;
+}
+
+static int set_user_ckpt_trap(struct task_struct *task, unsigned long trap)
+{
+       task->thread.ckpt_regs.trap = trap & 0xfff0;
+       return 0;
+}
+#endif
+
  #ifdef CONFIG_PPC64
  static int get_user_dscr(struct task_struct *task, unsigned long *data)
  {
@@ -358,6 +382,29 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset,
         return ret;
  }
  
+/*
+ * When the transaction is active, 'transact_fp' holds the current running
+ * value of all FPR registers and 'fp_state' holds the last checkpointed
+ * value of all FPR registers for the current transaction. When transaction
+ * is not active 'fp_state' holds the current running state of all the FPR
+ * registers. So this function which returns the current running values of
+ * all the FPR registers, needs to know whether any transaction is active
+ * or not.
+ *
+ * Userspace interface buffer layout:
+ *
+ * struct data {
+ *     u64     fpr[32];
+ *     u64     fpscr;
+ * };
+ *
+ * There are two config options CONFIG_VSX and CONFIG_PPC_TRANSACTIONAL_MEM
+ * which determine the final code in this function. All the combinations of
+ * these two config options are possible except the one below as transactional
+ * memory config pulls in CONFIG_VSX automatically.
+ *
+ *     !defined(CONFIG_VSX) && defined(CONFIG_PPC_TRANSACTIONAL_MEM)
+ */
  static int fpr_get(struct task_struct *target, const struct user_regset *regset,
                    unsigned int pos, unsigned int count,
                    void *kbuf, void __user *ubuf)
@@ -368,14 +415,31 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset,
  #endif
         flush_fp_to_thread(target);
  
-#ifdef CONFIG_VSX
+#if defined(CONFIG_VSX) && defined(CONFIG_PPC_TRANSACTIONAL_MEM)
+       /* copy to local buffer then write that out */
+       if (MSR_TM_ACTIVE(target->thread.regs->msr)) {
+               flush_altivec_to_thread(target);
+               flush_tmregs_to_thread(target);
+               for (i = 0; i < 32 ; i++)
+                       buf[i] = target->thread.TS_TRANS_FPR(i);
+               buf[32] = target->thread.transact_fp.fpscr;
+       } else {
+               for (i = 0; i < 32 ; i++)
+                       buf[i] = target->thread.TS_FPR(i);
+               buf[32] = target->thread.fp_state.fpscr;
+       }
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
+#endif
+
+#if defined(CONFIG_VSX) && !defined(CONFIG_PPC_TRANSACTIONAL_MEM)
         /* copy to local buffer then write that out */
         for (i = 0; i < 32 ; i++)
                 buf[i] = target->thread.TS_FPR(i);
         buf[32] = target->thread.fp_state.fpscr;
         return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
+#endif
  
-#else
+#if !defined(CONFIG_VSX) && !defined(CONFIG_PPC_TRANSACTIONAL_MEM)
         BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) !=
                      offsetof(struct thread_fp_state, fpr[32]));
  
@@ -384,6 +448,29 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset,
  #endif
  }
  
+/*
+ * When the transaction is active, 'transact_fp' holds the current running
+ * value of all FPR registers and 'fp_state' holds the last checkpointed
+ * value of all FPR registers for the current transaction. When transaction
+ * is not active 'fp_state' holds the current running state of all the FPR
+ * registers. So this function which sets the current running values of
+ * all the FPR registers, needs to know whether any transaction is active
+ * or not.
+ *
+ * Userspace interface buffer layout:
+ *
+ * struct data {
+ *     u64     fpr[32];
+ *     u64     fpscr;
+ * };
+ *
+ * There are two config options CONFIG_VSX and CONFIG_PPC_TRANSACTIONAL_MEM
+ * which determine the final code in this function. All the combinations of
+ * these two config options are possible except the one below as transactional
+ * memory config pulls in CONFIG_VSX automatically.
+ *
+ *     !defined(CONFIG_VSX) && defined(CONFIG_PPC_TRANSACTIONAL_MEM)
+ */
  static int fpr_set(struct task_struct *target, const struct user_regset *regset,
                    unsigned int pos, unsigned int count,
                    const void *kbuf, const void __user *ubuf)
@@ -394,7 +481,27 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,
  #endif
         flush_fp_to_thread(target);
  
-#ifdef CONFIG_VSX
+#if defined(CONFIG_VSX) && defined(CONFIG_PPC_TRANSACTIONAL_MEM)
+       /* copy to local buffer then write that out */
+       i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
+       if (i)
+               return i;
+
+       if (MSR_TM_ACTIVE(target->thread.regs->msr)) {
+               flush_altivec_to_thread(target);
+               flush_tmregs_to_thread(target);
+               for (i = 0; i < 32 ; i++)
+                       target->thread.TS_TRANS_FPR(i) = buf[i];
+               target->thread.transact_fp.fpscr = buf[32];
+       } else {
+               for (i = 0; i < 32 ; i++)
+                       target->thread.TS_FPR(i) = buf[i];
+               target->thread.fp_state.fpscr = buf[32];
+       }
+       return 0;
+#endif
+
+#if defined(CONFIG_VSX) && !defined(CONFIG_PPC_TRANSACTIONAL_MEM)
         /* copy to local buffer then write that out */
         i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
         if (i)
@@ -403,7 +510,9 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,
                 target->thread.TS_FPR(i) = buf[i];
         target->thread.fp_state.fpscr = buf[32];
         return 0;
-#else
+#endif
+
+#if !defined(CONFIG_VSX) && !defined(CONFIG_PPC_TRANSACTIONAL_MEM)
         BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) !=
                      offsetof(struct thread_fp_state, fpr[32]));
  
@@ -433,10 +542,28 @@ static int vr_active(struct task_struct *target,
         return target->thread.used_vr ? regset->n : 0;
  }
  
+/*
+ * When the transaction is active, 'transact_vr' holds the current running
+ * value of all the VMX registers and 'vr_state' holds the last checkpointed
+ * value of all the VMX registers for the current transaction to fall back
+ * on in case it aborts. When transaction is not active 'vr_state' holds
+ * the current running state of all the VMX registers. So this function which
+ * gets the current running values of all the VMX registers, needs to know
+ * whether any transaction is active or not.
+ *
+ * Userspace interface buffer layout:
+ *
+ * struct data {
+ *     vector128       vr[32];
+ *     vector128       vscr;
+ *     vector128       vrsave;
+ * };
+ */
  static int vr_get(struct task_struct *target, const struct user_regset *regset,
                   unsigned int pos, unsigned int count,
                   void *kbuf, void __user *ubuf)
  {
+       struct thread_vr_state *addr;
         int ret;
  
         flush_altivec_to_thread(target);
@@ -444,8 +571,19 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset,
         BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) !=
                      offsetof(struct thread_vr_state, vr[32]));
  
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       if (MSR_TM_ACTIVE(target->thread.regs->msr)) {
+               flush_fp_to_thread(target);
+               flush_tmregs_to_thread(target);
+               addr = &target->thread.transact_vr;
+       } else {
+               addr = &target->thread.vr_state;
+       }
+#else
+       addr = &target->thread.vr_state;
+#endif
         ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-                                 &target->thread.vr_state, 0,
+                                 addr, 0,
                                   33 * sizeof(vector128));
         if (!ret) {
                 /*
@@ -456,7 +594,16 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset,
                         u32 word;
                 } vrsave;
                 memset(&vrsave, 0, sizeof(vrsave));
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+               if (MSR_TM_ACTIVE(target->thread.regs->msr))
+                       vrsave.word = target->thread.transact_vrsave;
+               else
+                       vrsave.word = target->thread.vrsave;
+#else
                 vrsave.word = target->thread.vrsave;
+#endif
+
                 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave,
                                           33 * sizeof(vector128), -1);
         }
@@ -464,10 +611,28 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset,
         return ret;
  }
  
+/*
+ * When the transaction is active, 'transact_vr' holds the current running
+ * value of all the VMX registers and 'vr_state' holds the last checkpointed
+ * value of all the VMX registers for the current transaction to fall back
+ * on in case it aborts. When transaction is not active 'vr_state' holds
+ * the current running state of all the VMX registers. So this function which
+ * sets the current running values of all the VMX registers, needs to know
+ * whether any transaction is active or not.
+ *
+ * Userspace interface buffer layout:
+ *
+ * struct data {
+ *     vector128       vr[32];
+ *     vector128       vscr;
+ *     vector128       vrsave;
+ * };
+ */
  static int vr_set(struct task_struct *target, const struct user_regset *regset,
                   unsigned int pos, unsigned int count,
                   const void *kbuf, const void __user *ubuf)
  {
+       struct thread_vr_state *addr;
         int ret;
  
         flush_altivec_to_thread(target);
@@ -475,8 +640,19 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset,
         BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) !=
                      offsetof(struct thread_vr_state, vr[32]));
  
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       if (MSR_TM_ACTIVE(target->thread.regs->msr)) {
+               flush_fp_to_thread(target);
+               flush_tmregs_to_thread(target);
+               addr = &target->thread.transact_vr;
+       } else {
+               addr = &target->thread.vr_state;
+       }
+#else
+       addr = &target->thread.vr_state;
+#endif
         ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-                                &target->thread.vr_state, 0,
+                                addr, 0,
                                  33 * sizeof(vector128));
         if (!ret && count > 0) {
                 /*
@@ -487,11 +663,28 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset,
                         u32 word;
                 } vrsave;
                 memset(&vrsave, 0, sizeof(vrsave));
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+               if (MSR_TM_ACTIVE(target->thread.regs->msr))
+                       vrsave.word = target->thread.transact_vrsave;
+               else
+                       vrsave.word = target->thread.vrsave;
+#else
                 vrsave.word = target->thread.vrsave;
+#endif
                 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave,
                                          33 * sizeof(vector128), -1);
-               if (!ret)
+               if (!ret) {
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+                       if (MSR_TM_ACTIVE(target->thread.regs->msr))
+                               target->thread.transact_vrsave = vrsave.word;
+                       else
+                               target->thread.vrsave = vrsave.word;
+#else
                         target->thread.vrsave = vrsave.word;
+#endif
+               }
         }
  
         return ret;
@@ -512,6 +705,21 @@ static int vsr_active(struct task_struct *target,
         return target->thread.used_vsr ? regset->n : 0;
  }
  
+/*
+ * When the transaction is active, 'transact_fp' holds the current running
+ * value of all FPR registers and 'fp_state' holds the last checkpointed
+ * value of all FPR registers for the current transaction. When transaction
+ * is not active 'fp_state' holds the current running state of all the FPR
+ * registers. So this function which returns the current running values of
+ * all the FPR registers, needs to know whether any transaction is active
+ * or not.
+ *
+ * Userspace interface buffer layout:
+ *
+ * struct data {
+ *     u64     vsx[32];
+ * };
+ */
  static int vsr_get(struct task_struct *target, const struct user_regset *regset,
                    unsigned int pos, unsigned int count,
                    void *kbuf, void __user *ubuf)
@@ -519,16 +727,47 @@ static int vsr_get(struct task_struct *target, const struct user_regset *regset,
         u64 buf[32];
         int ret, i;
  
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       flush_fp_to_thread(target);
+       flush_altivec_to_thread(target);
+       flush_tmregs_to_thread(target);
+#endif
         flush_vsx_to_thread(target);
  
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       if (MSR_TM_ACTIVE(target->thread.regs->msr)) {
+               for (i = 0; i < 32 ; i++)
+                       buf[i] = target->thread.
+                               transact_fp.fpr[i][TS_VSRLOWOFFSET];
+       } else {
+               for (i = 0; i < 32 ; i++)
+                       buf[i] = target->thread.
+                               fp_state.fpr[i][TS_VSRLOWOFFSET];
+       }
+#else
         for (i = 0; i < 32 ; i++)
                 buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET];
+#endif
         ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
                                   buf, 0, 32 * sizeof(double));
  
         return ret;
  }
  
+/*
+ * When the transaction is active, 'transact_fp' holds the current running
+ * value of all FPR registers and 'fp_state' holds the last checkpointed
+ * value of all FPR registers for the current transaction. When transaction
+ * is not active 'fp_state' holds the current running state of all the FPR
+ * registers. So this function which sets the current running values of all
+ * the FPR registers, needs to know whether any transaction is active or not.
+ *
+ * Userspace interface buffer layout:
+ *
+ * struct data {
+ *     u64     vsx[32];
+ * };
+ */
  static int vsr_set(struct task_struct *target, const struct user_regset *regset,
                    unsigned int pos, unsigned int count,
                    const void *kbuf, const void __user *ubuf)
@@ -536,12 +775,30 @@ static int vsr_set(struct task_struct *target, const struct user_regset *regset,
         u64 buf[32];
         int ret,i;
  
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       flush_fp_to_thread(target);
+       flush_altivec_to_thread(target);
+       flush_tmregs_to_thread(target);
+#endif
         flush_vsx_to_thread(target);
  
         ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
                                  buf, 0, 32 * sizeof(double));
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       if (MSR_TM_ACTIVE(target->thread.regs->msr)) {
+               for (i = 0; i < 32 ; i++)
+                       target->thread.transact_fp.
+                               fpr[i][TS_VSRLOWOFFSET] = buf[i];
+       } else {
+               for (i = 0; i < 32 ; i++)
+                       target->thread.fp_state.
+                               fpr[i][TS_VSRLOWOFFSET] = buf[i];
+       }
+#else
         for (i = 0; i < 32 ; i++)
                 target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+#endif
  
  
         return ret;
@@ -614,143 +871,1277 @@ static int evr_set(struct task_struct *target, const struct user_regset *regset,
  }
  #endif /* CONFIG_SPE */
  
-
-/*
- * These are our native regset flavors.
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/**
+ * tm_cgpr_active - get active number of registers in CGPR
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ *
+ * This function checks for the active number of available
+ * registers in transaction checkpointed GPR category.
   */
-enum powerpc_regset {
-       REGSET_GPR,
-       REGSET_FPR,
-#ifdef CONFIG_ALTIVEC
-       REGSET_VMX,
-#endif
-#ifdef CONFIG_VSX
-       REGSET_VSX,
-#endif
-#ifdef CONFIG_SPE
-       REGSET_SPE,
-#endif
-};
-
-static const struct user_regset native_regsets[] = {
-       [REGSET_GPR] = {
-               .core_note_type = NT_PRSTATUS, .n = ELF_NGREG,
-               .size = sizeof(long), .align = sizeof(long),
-               .get = gpr_get, .set = gpr_set
-       },
-       [REGSET_FPR] = {
-               .core_note_type = NT_PRFPREG, .n = ELF_NFPREG,
-               .size = sizeof(double), .align = sizeof(double),
-               .get = fpr_get, .set = fpr_set
-       },
-#ifdef CONFIG_ALTIVEC
-       [REGSET_VMX] = {
-               .core_note_type = NT_PPC_VMX, .n = 34,
-               .size = sizeof(vector128), .align = sizeof(vector128),
-               .active = vr_active, .get = vr_get, .set = vr_set
-       },
-#endif
-#ifdef CONFIG_VSX
-       [REGSET_VSX] = {
-               .core_note_type = NT_PPC_VSX, .n = 32,
-               .size = sizeof(double), .align = sizeof(double),
-               .active = vsr_active, .get = vsr_get, .set = vsr_set
-       },
-#endif
-#ifdef CONFIG_SPE
-       [REGSET_SPE] = {
-               .core_note_type = NT_PPC_SPE, .n = 35,
-               .size = sizeof(u32), .align = sizeof(u32),
-               .active = evr_active, .get = evr_get, .set = evr_set
-       },
-#endif
-};
+static int tm_cgpr_active(struct task_struct *target,
+                         const struct user_regset *regset)
+{
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
  
-static const struct user_regset_view user_ppc_native_view = {
-       .name = UTS_MACHINE, .e_machine = ELF_ARCH, .ei_osabi = ELF_OSABI,
-       .regsets = native_regsets, .n = ARRAY_SIZE(native_regsets)
-};
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return 0;
  
-#ifdef CONFIG_PPC64
-#include <linux/compat.h>
+       return regset->n;
+}
  
-static int gpr32_get(struct task_struct *target,
-                    const struct user_regset *regset,
-                    unsigned int pos, unsigned int count,
-                    void *kbuf, void __user *ubuf)
+/**
+ * tm_cgpr_get - get CGPR registers
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ * @pos:       The buffer position.
+ * @count:     Number of bytes to copy.
+ * @kbuf:      Kernel buffer to copy from.
+ * @ubuf:      User buffer to copy into.
+ *
+ * This function gets transaction checkpointed GPR registers.
+ *
+ * When the transaction is active, 'ckpt_regs' holds all the checkpointed
+ * GPR register values for the current transaction to fall back on if it
+ * aborts in between. This function gets those checkpointed GPR registers.
+ * The userspace interface buffer layout is as follows.
+ *
+ * struct data {
+ *     struct pt_regs ckpt_regs;
+ * };
+ */
+static int tm_cgpr_get(struct task_struct *target,
+                       const struct user_regset *regset,
+                       unsigned int pos, unsigned int count,
+                       void *kbuf, void __user *ubuf)
  {
-       const unsigned long *regs = &target->thread.regs->gpr[0];
-       compat_ulong_t *k = kbuf;
-       compat_ulong_t __user *u = ubuf;
-       compat_ulong_t reg;
-       int i;
+       int ret;
  
-       if (target->thread.regs == NULL)
-               return -EIO;
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
  
-       if (!FULL_REGS(target->thread.regs)) {
-               /* We have a partial register set.  Fill 14-31 with bogus values */
-               for (i = 14; i < 32; i++)
-                       target->thread.regs->gpr[i] = NV_REG_POISON; 
-       }
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
  
-       pos /= sizeof(reg);
-       count /= sizeof(reg);
+       flush_fp_to_thread(target);
+       flush_altivec_to_thread(target);
+       flush_tmregs_to_thread(target);
  
-       if (kbuf)
-               for (; count > 0 && pos < PT_MSR; --count)
-                       *k++ = regs[pos++];
-       else
-               for (; count > 0 && pos < PT_MSR; --count)
-                       if (__put_user((compat_ulong_t) regs[pos++], u++))
-                               return -EFAULT;
+       ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                 &target->thread.ckpt_regs,
+                                 0, offsetof(struct pt_regs, msr));
+       if (!ret) {
+               unsigned long msr = get_user_ckpt_msr(target);
  
-       if (count > 0 && pos == PT_MSR) {
-               reg = get_user_msr(target);
-               if (kbuf)
-                       *k++ = reg;
-               else if (__put_user(reg, u++))
-                       return -EFAULT;
-               ++pos;
-               --count;
+               ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &msr,
+                                         offsetof(struct pt_regs, msr),
+                                         offsetof(struct pt_regs, msr) +
+                                         sizeof(msr));
         }
  
-       if (kbuf)
-               for (; count > 0 && pos < PT_REGS_COUNT; --count)
-                       *k++ = regs[pos++];
-       else
-               for (; count > 0 && pos < PT_REGS_COUNT; --count)
-                       if (__put_user((compat_ulong_t) regs[pos++], u++))
-                               return -EFAULT;
+       BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) !=
+                    offsetof(struct pt_regs, msr) + sizeof(long));
  
-       kbuf = k;
-       ubuf = u;
-       pos *= sizeof(reg);
-       count *= sizeof(reg);
-       return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
-                                       PT_REGS_COUNT * sizeof(reg), -1);
+       if (!ret)
+               ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                         &target->thread.ckpt_regs.orig_gpr3,
+                                         offsetof(struct pt_regs, orig_gpr3),
+                                         sizeof(struct pt_regs));
+       if (!ret)
+               ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+                                              sizeof(struct pt_regs), -1);
+
+       return ret;
  }
  
-static int gpr32_set(struct task_struct *target,
-                    const struct user_regset *regset,
-                    unsigned int pos, unsigned int count,
-                    const void *kbuf, const void __user *ubuf)
+/**
+ * tm_cgpr_set - set the CGPR registers
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ * @pos:       The buffer position.
+ * @count:     Number of bytes to copy.
+ * @kbuf:      Kernel buffer to copy into.
+ * @ubuf:      User buffer to copy from.
+ *
+ * This function sets in transaction checkpointed GPR registers.
+ *
+ * When the transaction is active, 'ckpt_regs' holds the checkpointed
+ * GPR register values for the current transaction to fall back on if it
+ * aborts in between. This function sets those checkpointed GPR registers.
+ * The userspace interface buffer layout is as follows.
+ *
+ * struct data {
+ *     struct pt_regs ckpt_regs;
+ * };
+ */
+static int tm_cgpr_set(struct task_struct *target,
+                       const struct user_regset *regset,
+                       unsigned int pos, unsigned int count,
+                       const void *kbuf, const void __user *ubuf)
  {
-       unsigned long *regs = &target->thread.regs->gpr[0];
-       const compat_ulong_t *k = kbuf;
-       const compat_ulong_t __user *u = ubuf;
-       compat_ulong_t reg;
+       unsigned long reg;
+       int ret;
  
-       if (target->thread.regs == NULL)
-               return -EIO;
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
  
-       CHECK_FULL_REGS(target->thread.regs);
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
  
-       pos /= sizeof(reg);
-       count /= sizeof(reg);
+       flush_fp_to_thread(target);
+       flush_altivec_to_thread(target);
+       flush_tmregs_to_thread(target);
  
-       if (kbuf)
-               for (; count > 0 && pos < PT_MSR; --count)
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                &target->thread.ckpt_regs,
+                                0, PT_MSR * sizeof(reg));
+
+       if (!ret && count > 0) {
+               ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &reg,
+                                        PT_MSR * sizeof(reg),
+                                        (PT_MSR + 1) * sizeof(reg));
+               if (!ret)
+                       ret = set_user_ckpt_msr(target, reg);
+       }
+
+       BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) !=
+                    offsetof(struct pt_regs, msr) + sizeof(long));
+
+       if (!ret)
+               ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                        &target->thread.ckpt_regs.orig_gpr3,
+                                        PT_ORIG_R3 * sizeof(reg),
+                                        (PT_MAX_PUT_REG + 1) * sizeof(reg));
+
+       if (PT_MAX_PUT_REG + 1 < PT_TRAP && !ret)
+               ret = user_regset_copyin_ignore(
+                       &pos, &count, &kbuf, &ubuf,
+                       (PT_MAX_PUT_REG + 1) * sizeof(reg),
+                       PT_TRAP * sizeof(reg));
+
+       if (!ret && count > 0) {
+               ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &reg,
+                                        PT_TRAP * sizeof(reg),
+                                        (PT_TRAP + 1) * sizeof(reg));
+               if (!ret)
+                       ret = set_user_ckpt_trap(target, reg);
+       }
+
+       if (!ret)
+               ret = user_regset_copyin_ignore(
+                       &pos, &count, &kbuf, &ubuf,
+                       (PT_TRAP + 1) * sizeof(reg), -1);
+
+       return ret;
+}
+
+/**
+ * tm_cfpr_active - get active number of registers in CFPR
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ *
+ * This function checks for the active number of available
+ * registers in transaction checkpointed FPR category.
+ */
+static int tm_cfpr_active(struct task_struct *target,
+                               const struct user_regset *regset)
+{
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return 0;
+
+       return regset->n;
+}
+
+/**
+ * tm_cfpr_get - get CFPR registers
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ * @pos:       The buffer position.
+ * @count:     Number of bytes to copy.
+ * @kbuf:      Kernel buffer to copy from.
+ * @ubuf:      User buffer to copy into.
+ *
+ * This function gets in transaction checkpointed FPR registers.
+ *
+ * When the transaction is active 'fp_state' holds the checkpointed
+ * values for the current transaction to fall back on if it aborts
+ * in between. This function gets those checkpointed FPR registers.
+ * The userspace interface buffer layout is as follows.
+ *
+ * struct data {
+ *     u64     fpr[32];
+ *     u64     fpscr;
+ *};
+ */
+static int tm_cfpr_get(struct task_struct *target,
+                       const struct user_regset *regset,
+                       unsigned int pos, unsigned int count,
+                       void *kbuf, void __user *ubuf)
+{
+       u64 buf[33];
+       int i;
+
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
+
+       flush_fp_to_thread(target);
+       flush_altivec_to_thread(target);
+       flush_tmregs_to_thread(target);
+
+       /* copy to local buffer then write that out */
+       for (i = 0; i < 32 ; i++)
+               buf[i] = target->thread.TS_FPR(i);
+       buf[32] = target->thread.fp_state.fpscr;
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
+}
+
+/**
+ * tm_cfpr_set - set CFPR registers
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ * @pos:       The buffer position.
+ * @count:     Number of bytes to copy.
+ * @kbuf:      Kernel buffer to copy from.
+ * @ubuf:      User buffer to copy from.
+ *
+ * This function sets in transaction checkpointed FPR registers.
+ *
+ * When the transaction is active 'fp_state' holds the checkpointed
+ * FPR register values for the current transaction to fall back on
+ * if it aborts in between. This function sets these checkpointed
+ * FPR registers. The userspace interface buffer layout is as follows.
+ *
+ * struct data {
+ *     u64     fpr[32];
+ *     u64     fpscr;
+ * };
+ *
+ * Returns -ENODEV if the CPU lacks CPU_FTR_TM, -ENODATA if
+ * MSR_TM_ACTIVE() is false for @target's MSR, the error from
+ * user_regset_copyin() if the copy fails, and 0 on success.
+ */
+static int tm_cfpr_set(struct task_struct *target,
+                       const struct user_regset *regset,
+                       unsigned int pos, unsigned int count,
+                       const void *kbuf, const void __user *ubuf)
+{
+       u64 buf[33];
+       int i;
+
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
+
+       flush_fp_to_thread(target);
+       flush_altivec_to_thread(target);
+       flush_tmregs_to_thread(target);
+
+       /*
+        * Seed the staging buffer with the current values so that a write
+        * covering only part of the regset leaves the remaining registers
+        * unchanged instead of loading them from uninitialised stack.
+        */
+       for (i = 0; i < 32; i++)
+               buf[i] = target->thread.TS_FPR(i);
+       buf[32] = target->thread.fp_state.fpscr;
+
+       /* overlay the caller's data on the staging buffer, then commit it */
+       i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
+       if (i)
+               return i;
+       for (i = 0; i < 32 ; i++)
+               target->thread.TS_FPR(i) = buf[i];
+       target->thread.fp_state.fpscr = buf[32];
+       return 0;
+}
+
+/**
+ * tm_cvmx_active - get active number of registers in CVMX
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ *
+ * This function reports how many registers of the checkpointed VMX
+ * category are currently available: @regset->n when MSR_TM_ACTIVE()
+ * holds for @target's MSR, 0 otherwise, or -ENODEV when the CPU lacks
+ * CPU_FTR_TM.
+ */
+static int tm_cvmx_active(struct task_struct *target,
+                               const struct user_regset *regset)
+{
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (MSR_TM_ACTIVE(target->thread.regs->msr))
+               return regset->n;
+
+       return 0;
+}
+
+/**
+ * tm_cvmx_get - get CVMX registers
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ * @pos:       The buffer position.
+ * @count:     Number of bytes to copy.
+ * @kbuf:      Kernel buffer to copy into.
+ * @ubuf:      User buffer to copy into.
+ *
+ * This function gets in transaction checkpointed VMX registers.
+ *
+ * When the transaction is active 'vr_state' and 'vr_save' hold
+ * the checkpointed values for the current transaction to fall
+ * back on if it aborts in between. The userspace interface buffer
+ * layout is as follows.
+ *
+ * struct data {
+ *     vector128       vr[32];
+ *     vector128       vscr;
+ *     vector128       vrsave;
+ * };
+ *
+ * Returns -ENODEV if the CPU lacks CPU_FTR_TM, -ENODATA if
+ * MSR_TM_ACTIVE() is false for @target's MSR, otherwise the result
+ * of user_regset_copyout().
+ */
+static int tm_cvmx_get(struct task_struct *target,
+                       const struct user_regset *regset,
+                       unsigned int pos, unsigned int count,
+                       void *kbuf, void __user *ubuf)
+{
+       /* vrsave is exported in a full vector-sized slot, low word only */
+       union {
+               elf_vrreg_t reg;
+               u32 word;
+       } vrsave;
+       int err;
+
+       /* vscr must sit right behind vr[31] so 33 vectors copy in one go */
+       BUILD_BUG_ON(TVSO(vscr) != TVSO(vr[32]));
+
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
+
+       /* Flush the state */
+       flush_fp_to_thread(target);
+       flush_altivec_to_thread(target);
+       flush_tmregs_to_thread(target);
+
+       err = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                       &target->thread.vr_state, 0,
+                                       33 * sizeof(vector128));
+       if (err)
+               return err;
+
+       /*
+        * Copy out only the low-order word of vrsave.
+        */
+       memset(&vrsave, 0, sizeof(vrsave));
+       vrsave.word = target->thread.vrsave;
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave,
+                                       33 * sizeof(vector128), -1);
+}
+
+/**
+ * tm_cvmx_set - set CVMX registers
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ * @pos:       The buffer position.
+ * @count:     Number of bytes to copy.
+ * @kbuf:      Kernel buffer to copy from.
+ * @ubuf:      User buffer to copy from.
+ *
+ * This function sets in transaction checkpointed VMX registers.
+ *
+ * When the transaction is active 'vr_state' and 'vr_save' hold
+ * the checkpointed values for the current transaction to fall
+ * back on if it aborts in between. The userspace interface buffer
+ * layout is as follows.
+ *
+ * struct data {
+ *     vector128       vr[32];
+ *     vector128       vscr;
+ *     vector128       vrsave;
+ * };
+ *
+ * Returns -ENODEV if the CPU lacks CPU_FTR_TM, -ENODATA if
+ * MSR_TM_ACTIVE() is false for @target's MSR, otherwise the result
+ * of user_regset_copyin().
+ */
+static int tm_cvmx_set(struct task_struct *target,
+                       const struct user_regset *regset,
+                       unsigned int pos, unsigned int count,
+                       const void *kbuf, const void __user *ubuf)
+{
+       /* vrsave arrives in a full vector-sized slot, low word only */
+       union {
+               elf_vrreg_t reg;
+               u32 word;
+       } vrsave;
+       int err;
+
+       /* vscr must sit right behind vr[31] so 33 vectors copy in one go */
+       BUILD_BUG_ON(TVSO(vscr) != TVSO(vr[32]));
+
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
+
+       flush_fp_to_thread(target);
+       flush_altivec_to_thread(target);
+       flush_tmregs_to_thread(target);
+
+       err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                       &target->thread.vr_state, 0,
+                                       33 * sizeof(vector128));
+       /* stop on error, or when the request ends before the vrsave slot */
+       if (err || count == 0)
+               return err;
+
+       /*
+        * We use only the low-order word of vrsave. The staging copy
+        * starts from the current value.
+        */
+       memset(&vrsave, 0, sizeof(vrsave));
+       vrsave.word = target->thread.vrsave;
+       err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave,
+                                       33 * sizeof(vector128), -1);
+       if (err)
+               return err;
+
+       target->thread.vrsave = vrsave.word;
+       return 0;
+}
+
+/**
+ * tm_cvsx_active - get active number of registers in CVSX
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ *
+ * This function reports how many registers of the transaction
+ * checkpointed VSX category are currently available: -ENODEV when the
+ * CPU lacks CPU_FTR_TM, 0 when MSR_TM_ACTIVE() is false for @target's
+ * MSR or @target has not used VSX, and @regset->n otherwise.
+ */
+static int tm_cvsx_active(struct task_struct *target,
+                               const struct user_regset *regset)
+{
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return 0;
+
+       /* used_vsr is only consulted after the VSX state has been flushed */
+       flush_vsx_to_thread(target);
+       if (!target->thread.used_vsr)
+               return 0;
+
+       return regset->n;
+}
+
+/**
+ * tm_cvsx_get - get CVSX registers
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ * @pos:       The buffer position.
+ * @count:     Number of bytes to copy.
+ * @kbuf:      Kernel buffer to copy into.
+ * @ubuf:      User buffer to copy into.
+ *
+ * This function gets in transaction checkpointed VSX registers.
+ *
+ * When the transaction is active 'fp_state' holds the checkpointed
+ * values for the current transaction to fall back on if it aborts
+ * in between. This function gets those checkpointed VSX registers.
+ * The userspace interface buffer layout is as follows.
+ *
+ * struct data {
+ *     u64     vsx[32];
+ * };
+ *
+ * Returns -ENODEV if the CPU lacks CPU_FTR_TM, -ENODATA if
+ * MSR_TM_ACTIVE() is false for @target's MSR, otherwise the result
+ * of user_regset_copyout().
+ */
+static int tm_cvsx_get(struct task_struct *target,
+                       const struct user_regset *regset,
+                       unsigned int pos, unsigned int count,
+                       void *kbuf, void __user *ubuf)
+{
+       u64 staging[32];
+       int n;
+
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
+
+       /* Flush the state */
+       flush_fp_to_thread(target);
+       flush_altivec_to_thread(target);
+       flush_tmregs_to_thread(target);
+       flush_vsx_to_thread(target);
+
+       /* gather the TS_VSRLOWOFFSET doubleword of each of the 32 entries */
+       for (n = 0; n < 32; n++)
+               staging[n] = target->thread.fp_state.fpr[n][TS_VSRLOWOFFSET];
+
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                  staging, 0, 32 * sizeof(double));
+}
+
+/**
+ * tm_cvsx_set - set CVSX registers
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ * @pos:       The buffer position.
+ * @count:     Number of bytes to copy.
+ * @kbuf:      Kernel buffer to copy from.
+ * @ubuf:      User buffer to copy from.
+ *
+ * This function sets in transaction checkpointed VSX registers.
+ *
+ * When the transaction is active 'fp_state' holds the checkpointed
+ * VSX register values for the current transaction to fall back on
+ * if it aborts in between. This function sets these checkpointed
+ * VSX registers. The userspace interface buffer layout is as follows.
+ *
+ * struct data {
+ *     u64     vsx[32];
+ * };
+ *
+ * Returns -ENODEV if the CPU lacks CPU_FTR_TM, -ENODATA if
+ * MSR_TM_ACTIVE() is false for @target's MSR, otherwise the result
+ * of user_regset_copyin(). Nothing is written to @target when the
+ * copy fails.
+ */
+static int tm_cvsx_set(struct task_struct *target,
+                       const struct user_regset *regset,
+                       unsigned int pos, unsigned int count,
+                       const void *kbuf, const void __user *ubuf)
+{
+       u64 buf[32];
+       int ret, i;
+
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
+
+       /* Flush the state */
+       flush_fp_to_thread(target);
+       flush_altivec_to_thread(target);
+       flush_tmregs_to_thread(target);
+       flush_vsx_to_thread(target);
+
+       /*
+        * Seed the staging buffer with the current values so that a write
+        * covering only part of the regset leaves the remaining registers
+        * unchanged instead of loading them from uninitialised stack.
+        */
+       for (i = 0; i < 32; i++)
+               buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET];
+
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                buf, 0, 32 * sizeof(double));
+       /* commit only when the copy succeeded */
+       if (!ret)
+               for (i = 0; i < 32 ; i++)
+                       target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] =
+                                                               buf[i];
+
+       return ret;
+}
+
+/**
+ * tm_spr_active - get active number of registers in TM SPR
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ *
+ * This function reports the number of available registers in the
+ * transactional memory SPR category: @regset->n when the CPU has
+ * CPU_FTR_TM, -ENODEV otherwise. @target is not inspected.
+ */
+static int tm_spr_active(struct task_struct *target,
+                        const struct user_regset *regset)
+{
+       if (cpu_has_feature(CPU_FTR_TM))
+               return regset->n;
+
+       return -ENODEV;
+}
+
+/**
+ * tm_spr_get - get the TM related SPR registers
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ * @pos:       The buffer position.
+ * @count:     Number of bytes to copy.
+ * @kbuf:      Kernel buffer to copy into.
+ * @ubuf:      User buffer to copy into.
+ *
+ * This function gets transactional memory related SPR registers.
+ * The userspace interface buffer layout is as follows.
+ *
+ * struct {
+ *     u64             tm_tfhar;
+ *     u64             tm_texasr;
+ *     u64             tm_tfiar;
+ * };
+ *
+ * Returns -ENODEV if the CPU lacks CPU_FTR_TM, otherwise the result of
+ * the first failing user_regset_copyout(), or 0 if all three succeed.
+ */
+static int tm_spr_get(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     void *kbuf, void __user *ubuf)
+{
+       int err;
+
+       /* Build tests: the three SPRs are back to back, then ckpt_regs */
+       BUILD_BUG_ON(TSO(tm_tfhar) + sizeof(u64) != TSO(tm_texasr));
+       BUILD_BUG_ON(TSO(tm_texasr) + sizeof(u64) != TSO(tm_tfiar));
+       BUILD_BUG_ON(TSO(tm_tfiar) + sizeof(u64) != TSO(ckpt_regs));
+
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       /* Flush the states */
+       flush_fp_to_thread(target);
+       flush_altivec_to_thread(target);
+       flush_tmregs_to_thread(target);
+
+       /* TFHAR register */
+       err = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tm_tfhar, 0, sizeof(u64));
+       if (err)
+               return err;
+
+       /* TEXASR register */
+       err = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tm_texasr, sizeof(u64),
+                               2 * sizeof(u64));
+       if (err)
+               return err;
+
+       /* TFIAR register */
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tm_tfiar,
+                               2 * sizeof(u64), 3 * sizeof(u64));
+}
+
+/**
+ * tm_spr_set - set the TM related SPR registers
+ * @target:    The target task.
+ * @regset:    The user regset structure.
+ * @pos:       The buffer position.
+ * @count:     Number of bytes to copy.
+ * @kbuf:      Kernel buffer to copy from.
+ * @ubuf:      User buffer to copy from.
+ *
+ * This function sets transactional memory related SPR registers.
+ * The userspace interface buffer layout is as follows.
+ *
+ * struct {
+ *     u64             tm_tfhar;
+ *     u64             tm_texasr;
+ *     u64             tm_tfiar;
+ * };
+ *
+ * Returns -ENODEV if the CPU lacks CPU_FTR_TM, otherwise the result of
+ * the first failing user_regset_copyin(), or 0 if all three succeed.
+ */
+static int tm_spr_set(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     const void *kbuf, const void __user *ubuf)
+{
+       int err;
+
+       /* Build tests: the three SPRs are back to back, then ckpt_regs */
+       BUILD_BUG_ON(TSO(tm_tfhar) + sizeof(u64) != TSO(tm_texasr));
+       BUILD_BUG_ON(TSO(tm_texasr) + sizeof(u64) != TSO(tm_tfiar));
+       BUILD_BUG_ON(TSO(tm_tfiar) + sizeof(u64) != TSO(ckpt_regs));
+
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       /* Flush the states */
+       flush_fp_to_thread(target);
+       flush_altivec_to_thread(target);
+       flush_tmregs_to_thread(target);
+
+       /* TFHAR register */
+       err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tm_tfhar, 0, sizeof(u64));
+       if (err)
+               return err;
+
+       /* TEXASR register */
+       err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tm_texasr, sizeof(u64),
+                               2 * sizeof(u64));
+       if (err)
+               return err;
+
+       /* TFIAR register */
+       return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tm_tfiar,
+                                2 * sizeof(u64), 3 * sizeof(u64));
+}
+
+/*
+ * Report how many registers the tm_tar regset currently exposes:
+ * -ENODEV without CPU_FTR_TM, 0 when MSR_TM_ACTIVE() is false for
+ * @target's MSR, @regset->n otherwise.
+ */
+static int tm_tar_active(struct task_struct *target,
+                        const struct user_regset *regset)
+{
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return 0;
+
+       return regset->n;
+}
+
+/*
+ * Copy out target->thread.tm_tar as a single u64.
+ *
+ * Returns -ENODEV without CPU_FTR_TM, -ENODATA when MSR_TM_ACTIVE() is
+ * false for @target's MSR, otherwise the result of user_regset_copyout().
+ */
+static int tm_tar_get(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     void *kbuf, void __user *ubuf)
+{
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
+
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tm_tar, 0, sizeof(u64));
+}
+
+/*
+ * Copy a single u64 from the caller into target->thread.tm_tar.
+ *
+ * Returns -ENODEV without CPU_FTR_TM, -ENODATA when MSR_TM_ACTIVE() is
+ * false for @target's MSR, otherwise the result of user_regset_copyin().
+ */
+static int tm_tar_set(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     const void *kbuf, const void __user *ubuf)
+{
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
+
+       return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tm_tar, 0, sizeof(u64));
+}
+
+/*
+ * Report how many registers the tm_ppr regset currently exposes:
+ * -ENODEV without CPU_FTR_TM, 0 when MSR_TM_ACTIVE() is false for
+ * @target's MSR, @regset->n otherwise.
+ */
+static int tm_ppr_active(struct task_struct *target,
+                        const struct user_regset *regset)
+{
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return 0;
+
+       return regset->n;
+}
+
+
+/*
+ * Copy out target->thread.tm_ppr as a single u64.
+ *
+ * Returns -ENODEV without CPU_FTR_TM, -ENODATA when MSR_TM_ACTIVE() is
+ * false for @target's MSR, otherwise the result of user_regset_copyout().
+ */
+static int tm_ppr_get(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     void *kbuf, void __user *ubuf)
+{
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
+
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tm_ppr, 0, sizeof(u64));
+}
+
+/*
+ * Copy a single u64 from the caller into target->thread.tm_ppr.
+ *
+ * Returns -ENODEV without CPU_FTR_TM, -ENODATA when MSR_TM_ACTIVE() is
+ * false for @target's MSR, otherwise the result of user_regset_copyin().
+ */
+static int tm_ppr_set(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     const void *kbuf, const void __user *ubuf)
+{
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
+
+       return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tm_ppr, 0, sizeof(u64));
+}
+
+/*
+ * Report how many registers the tm_dscr regset currently exposes:
+ * -ENODEV without CPU_FTR_TM, 0 when MSR_TM_ACTIVE() is false for
+ * @target's MSR, @regset->n otherwise.
+ */
+static int tm_dscr_active(struct task_struct *target,
+                        const struct user_regset *regset)
+{
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return 0;
+
+       return regset->n;
+}
+
+/*
+ * Copy out target->thread.tm_dscr as a single u64.
+ *
+ * Returns -ENODEV without CPU_FTR_TM, -ENODATA when MSR_TM_ACTIVE() is
+ * false for @target's MSR, otherwise the result of user_regset_copyout().
+ */
+static int tm_dscr_get(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     void *kbuf, void __user *ubuf)
+{
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
+
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tm_dscr, 0, sizeof(u64));
+}
+
+/*
+ * Copy a single u64 from the caller into target->thread.tm_dscr.
+ *
+ * Returns -ENODEV without CPU_FTR_TM, -ENODATA when MSR_TM_ACTIVE() is
+ * false for @target's MSR, otherwise the result of user_regset_copyin().
+ */
+static int tm_dscr_set(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     const void *kbuf, const void __user *ubuf)
+{
+       if (!cpu_has_feature(CPU_FTR_TM))
+               return -ENODEV;
+
+       if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+               return -ENODATA;
+
+       return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tm_dscr, 0, sizeof(u64));
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+#ifdef CONFIG_PPC64
+/* Copy out target->thread.ppr as a single u64. */
+static int ppr_get(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     void *kbuf, void __user *ubuf)
+{
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.ppr, 0, sizeof(u64));
+}
+
+/* Copy a single u64 from the caller into target->thread.ppr. */
+static int ppr_set(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     const void *kbuf, const void __user *ubuf)
+{
+       return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.ppr, 0, sizeof(u64));
+}
+
+/* Copy out target->thread.dscr as a single u64. */
+static int dscr_get(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     void *kbuf, void __user *ubuf)
+{
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.dscr, 0, sizeof(u64));
+}
+/* Copy a single u64 from the caller into target->thread.dscr. */
+static int dscr_set(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     const void *kbuf, const void __user *ubuf)
+{
+       return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.dscr, 0, sizeof(u64));
+}
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+/* Copy out target->thread.tar as a single u64. */
+static int tar_get(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     void *kbuf, void __user *ubuf)
+{
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tar, 0, sizeof(u64));
+}
+/* Copy a single u64 from the caller into target->thread.tar. */
+static int tar_set(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     const void *kbuf, const void __user *ubuf)
+{
+       return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                               &target->thread.tar, 0, sizeof(u64));
+}
+
+/*
+ * Report how many registers the EBB regset currently exposes:
+ * -ENODEV without CPU_FTR_ARCH_207S, @regset->n when
+ * target->thread.used_ebb is set, 0 otherwise.
+ */
+static int ebb_active(struct task_struct *target,
+                        const struct user_regset *regset)
+{
+       if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+               return -ENODEV;
+
+       if (!target->thread.used_ebb)
+               return 0;
+
+       return regset->n;
+}
+
+/*
+ * Copy out the three EBB registers (ebbrr, ebbhr, bescr) of @target in
+ * one contiguous transfer starting at target->thread.ebbrr.
+ *
+ * Returns -ENODEV without CPU_FTR_ARCH_207S, -ENODATA when
+ * target->thread.used_ebb is clear, otherwise the result of
+ * user_regset_copyout().
+ */
+static int ebb_get(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     void *kbuf, void __user *ubuf)
+{
+       /* ebbrr, ebbhr and bescr are copied as one span of this many bytes */
+       const int span = 3 * sizeof(unsigned long);
+
+       /* Build tests: the three fields must be adjacent and in order */
+       BUILD_BUG_ON(TSO(ebbrr) + sizeof(unsigned long) != TSO(ebbhr));
+       BUILD_BUG_ON(TSO(ebbhr) + sizeof(unsigned long) != TSO(bescr));
+
+       if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+               return -ENODEV;
+
+       if (!target->thread.used_ebb)
+               return -ENODATA;
+
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                       &target->thread.ebbrr, 0, span);
+}
+
+/*
+ * Write the three EBB registers (ebbrr, ebbhr, bescr) of @target, one
+ * unsigned long each, in that order.
+ *
+ * Returns -ENODEV without CPU_FTR_ARCH_207S and -ENODATA when
+ * target->thread.used_ebb is clear, which is the same condition under
+ * which ebb_active() reports 0 registers and ebb_get() returns -ENODATA.
+ * Otherwise returns the result of the first failing
+ * user_regset_copyin(), or 0 if all three succeed.
+ */
+static int ebb_set(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     const void *kbuf, const void __user *ubuf)
+{
+       int ret = 0;
+
+       /* Build tests */
+       BUILD_BUG_ON(TSO(ebbrr) + sizeof(unsigned long) != TSO(ebbhr));
+       BUILD_BUG_ON(TSO(ebbhr) + sizeof(unsigned long) != TSO(bescr));
+
+       if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+               return -ENODEV;
+
+       /* the regset only exists for tasks that have used EBB */
+       if (!target->thread.used_ebb)
+               return -ENODATA;
+
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                       &target->thread.ebbrr, 0, sizeof(unsigned long));
+
+       if (!ret)
+               ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                       &target->thread.ebbhr, sizeof(unsigned long),
+                       2 * sizeof(unsigned long));
+
+       if (!ret)
+               ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                       &target->thread.bescr,
+                       2 * sizeof(unsigned long), 3 * sizeof(unsigned long));
+
+       return ret;
+}
+/*
+ * Report how many registers the PMU regset exposes: @regset->n when the
+ * CPU has CPU_FTR_ARCH_207S, -ENODEV otherwise. @target is not inspected.
+ */
+static int pmu_active(struct task_struct *target,
+                        const struct user_regset *regset)
+{
+       if (cpu_has_feature(CPU_FTR_ARCH_207S))
+               return regset->n;
+
+       return -ENODEV;
+}
+
+/*
+ * Copy out the five PMU registers (siar, sdar, sier, mmcr2, mmcr0) of
+ * @target in one contiguous transfer starting at target->thread.siar.
+ *
+ * Returns -ENODEV without CPU_FTR_ARCH_207S, otherwise the result of
+ * user_regset_copyout().
+ */
+static int pmu_get(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     void *kbuf, void __user *ubuf)
+{
+       /* the five fields are copied as one span of this many bytes */
+       const int span = 5 * sizeof(unsigned long);
+
+       /* Build tests: the five fields must be adjacent and in order */
+       BUILD_BUG_ON(TSO(siar) + sizeof(unsigned long) != TSO(sdar));
+       BUILD_BUG_ON(TSO(sdar) + sizeof(unsigned long) != TSO(sier));
+       BUILD_BUG_ON(TSO(sier) + sizeof(unsigned long) != TSO(mmcr2));
+       BUILD_BUG_ON(TSO(mmcr2) + sizeof(unsigned long) != TSO(mmcr0));
+
+       if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+               return -ENODEV;
+
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                       &target->thread.siar, 0, span);
+}
+
+/*
+ * Write the five PMU registers (siar, sdar, sier, mmcr2, mmcr0) of
+ * @target, one unsigned long each, in that order.
+ *
+ * Returns -ENODEV without CPU_FTR_ARCH_207S, otherwise the result of the
+ * first failing user_regset_copyin(), or 0 if all five succeed.
+ */
+static int pmu_set(struct task_struct *target,
+                     const struct user_regset *regset,
+                     unsigned int pos, unsigned int count,
+                     const void *kbuf, const void __user *ubuf)
+{
+       int err;
+
+       /* Build tests: the five fields must be adjacent and in order */
+       BUILD_BUG_ON(TSO(siar) + sizeof(unsigned long) != TSO(sdar));
+       BUILD_BUG_ON(TSO(sdar) + sizeof(unsigned long) != TSO(sier));
+       BUILD_BUG_ON(TSO(sier) + sizeof(unsigned long) != TSO(mmcr2));
+       BUILD_BUG_ON(TSO(mmcr2) + sizeof(unsigned long) != TSO(mmcr0));
+
+       if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+               return -ENODEV;
+
+       /* siar */
+       err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                       &target->thread.siar, 0,
+                       sizeof(unsigned long));
+       if (err)
+               return err;
+
+       /* sdar */
+       err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                       &target->thread.sdar, sizeof(unsigned long),
+                       2 * sizeof(unsigned long));
+       if (err)
+               return err;
+
+       /* sier */
+       err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                       &target->thread.sier, 2 * sizeof(unsigned long),
+                       3 * sizeof(unsigned long));
+       if (err)
+               return err;
+
+       /* mmcr2 */
+       err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                       &target->thread.mmcr2, 3 * sizeof(unsigned long),
+                       4 * sizeof(unsigned long));
+       if (err)
+               return err;
+
+       /* mmcr0 */
+       return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                       &target->thread.mmcr0, 4 * sizeof(unsigned long),
+                       5 * sizeof(unsigned long));
+}
+#endif
+/*
+ * These are our native regset flavors.
+ *
+ * The enumerators are used as designated-initializer indices into
+ * native_regsets[]. Which ones exist depends on the kernel
+ * configuration, so their numeric values are not fixed.
+ */
+enum powerpc_regset {
+       REGSET_GPR,
+       REGSET_FPR,
+#ifdef CONFIG_ALTIVEC
+       REGSET_VMX,
+#endif
+#ifdef CONFIG_VSX
+       REGSET_VSX,
+#endif
+#ifdef CONFIG_SPE
+       REGSET_SPE,
+#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       REGSET_TM_CGPR,         /* TM checkpointed GPR registers */
+       REGSET_TM_CFPR,         /* TM checkpointed FPR registers */
+       REGSET_TM_CVMX,         /* TM checkpointed VMX registers */
+       REGSET_TM_CVSX,         /* TM checkpointed VSX registers */
+       REGSET_TM_SPR,          /* TM specific SPR registers */
+       REGSET_TM_CTAR,         /* TM checkpointed TAR register */
+       REGSET_TM_CPPR,         /* TM checkpointed PPR register */
+       REGSET_TM_CDSCR,        /* TM checkpointed DSCR register */
+#endif
+#ifdef CONFIG_PPC64
+       REGSET_PPR,             /* PPR register */
+       REGSET_DSCR,            /* DSCR register */
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+       REGSET_TAR,             /* TAR register */
+       REGSET_EBB,             /* EBB registers */
+       REGSET_PMR,             /* Performance Monitor Registers */
+#endif
+};
+
+/*
+ * Table of native regsets, indexed by enum powerpc_regset.
+ *
+ * Each entry gives the core-dump note type, the number of registers
+ * (.n), the size and alignment of one register, and the get/set
+ * callbacks. Entries that set .active supply a callback reporting how
+ * many registers are currently available; the GPR, FPR, PPR, DSCR and
+ * TAR entries do not set one.
+ */
+static const struct user_regset native_regsets[] = {
+       [REGSET_GPR] = {
+               .core_note_type = NT_PRSTATUS, .n = ELF_NGREG,
+               .size = sizeof(long), .align = sizeof(long),
+               .get = gpr_get, .set = gpr_set
+       },
+       [REGSET_FPR] = {
+               .core_note_type = NT_PRFPREG, .n = ELF_NFPREG,
+               .size = sizeof(double), .align = sizeof(double),
+               .get = fpr_get, .set = fpr_set
+       },
+#ifdef CONFIG_ALTIVEC
+       [REGSET_VMX] = {
+               .core_note_type = NT_PPC_VMX, .n = 34,
+               .size = sizeof(vector128), .align = sizeof(vector128),
+               .active = vr_active, .get = vr_get, .set = vr_set
+       },
+#endif
+#ifdef CONFIG_VSX
+       [REGSET_VSX] = {
+               .core_note_type = NT_PPC_VSX, .n = 32,
+               .size = sizeof(double), .align = sizeof(double),
+               .active = vsr_active, .get = vsr_get, .set = vsr_set
+       },
+#endif
+#ifdef CONFIG_SPE
+       [REGSET_SPE] = {
+               .core_note_type = NT_PPC_SPE, .n = 35,
+               .size = sizeof(u32), .align = sizeof(u32),
+               .active = evr_active, .get = evr_get, .set = evr_set
+       },
+#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       [REGSET_TM_CGPR] = {
+               .core_note_type = NT_PPC_TM_CGPR, .n = ELF_NGREG,
+               .size = sizeof(long), .align = sizeof(long),
+               .active = tm_cgpr_active, .get = tm_cgpr_get, .set = tm_cgpr_set
+       },
+       [REGSET_TM_CFPR] = {
+               .core_note_type = NT_PPC_TM_CFPR, .n = ELF_NFPREG,
+               .size = sizeof(double), .align = sizeof(double),
+               .active = tm_cfpr_active, .get = tm_cfpr_get, .set = tm_cfpr_set
+       },
+       [REGSET_TM_CVMX] = {
+               .core_note_type = NT_PPC_TM_CVMX, .n = ELF_NVMX,
+               .size = sizeof(vector128), .align = sizeof(vector128),
+               .active = tm_cvmx_active, .get = tm_cvmx_get, .set = tm_cvmx_set
+       },
+       [REGSET_TM_CVSX] = {
+               .core_note_type = NT_PPC_TM_CVSX, .n = ELF_NVSX,
+               .size = sizeof(double), .align = sizeof(double),
+               .active = tm_cvsx_active, .get = tm_cvsx_get, .set = tm_cvsx_set
+       },
+       [REGSET_TM_SPR] = {
+               .core_note_type = NT_PPC_TM_SPR, .n = ELF_NTMSPRREG,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .active = tm_spr_active, .get = tm_spr_get, .set = tm_spr_set
+       },
+       [REGSET_TM_CTAR] = {
+               .core_note_type = NT_PPC_TM_CTAR, .n = 1,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .active = tm_tar_active, .get = tm_tar_get, .set = tm_tar_set
+       },
+       [REGSET_TM_CPPR] = {
+               .core_note_type = NT_PPC_TM_CPPR, .n = 1,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .active = tm_ppr_active, .get = tm_ppr_get, .set = tm_ppr_set
+       },
+       [REGSET_TM_CDSCR] = {
+               .core_note_type = NT_PPC_TM_CDSCR, .n = 1,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .active = tm_dscr_active, .get = tm_dscr_get, .set = tm_dscr_set
+       },
+#endif
+#ifdef CONFIG_PPC64
+       [REGSET_PPR] = {
+               .core_note_type = NT_PPC_PPR, .n = 1,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .get = ppr_get, .set = ppr_set
+       },
+       [REGSET_DSCR] = {
+               .core_note_type = NT_PPC_DSCR, .n = 1,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .get = dscr_get, .set = dscr_set
+       },
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+       [REGSET_TAR] = {
+               .core_note_type = NT_PPC_TAR, .n = 1,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .get = tar_get, .set = tar_set
+       },
+       [REGSET_EBB] = {
+               .core_note_type = NT_PPC_EBB, .n = ELF_NEBB,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .active = ebb_active, .get = ebb_get, .set = ebb_set
+       },
+       [REGSET_PMR] = {
+               .core_note_type = NT_PPC_PMU, .n = ELF_NPMU,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .active = pmu_active, .get = pmu_get, .set = pmu_set
+       },
+#endif
+};
+
+/*
+ * The native regset view: bundles native_regsets[] (and its element
+ * count) with the machine name, ELF machine type and ELF OS ABI.
+ */
+static const struct user_regset_view user_ppc_native_view = {
+       .name = UTS_MACHINE, .e_machine = ELF_ARCH, .ei_osabi = ELF_OSABI,
+       .regsets = native_regsets, .n = ARRAY_SIZE(native_regsets)
+};
+
+#ifdef CONFIG_PPC64
+#include <linux/compat.h>
+
+/*
+ * gpr32_get_common - copy out a task's GPR regset as compat-sized words.
+ *
+ * When @tm_active is true the values come from target->thread.ckpt_regs,
+ * otherwise from target->thread.regs. @pos and @count arrive in bytes and
+ * are converted to compat_ulong_t units for the copy loops. Slots below
+ * PT_MSR and from PT_MSR + 1 up to PT_REGS_COUNT are taken from the
+ * selected register array, the PT_MSR slot is filled from
+ * get_user_msr(), and anything requested past PT_REGS_COUNT is
+ * zero-filled.
+ *
+ * Returns -EIO if @tm_active is false and @target has no thread.regs,
+ * -ENODEV if @tm_active is true but no checkpointed register array is
+ * available (kernel built without CONFIG_PPC_TRANSACTIONAL_MEM), -EFAULT
+ * if a store to @ubuf fails, otherwise the result of
+ * user_regset_copyout_zero().
+ */
+static int gpr32_get_common(struct task_struct *target,
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,
+                           void *kbuf, void __user *ubuf, bool tm_active)
+{
+       const unsigned long *regs;
+       /* stays NULL unless the TM configuration provides ckpt_regs */
+       const unsigned long *ckpt_regs = NULL;
+       compat_ulong_t *k = kbuf;
+       compat_ulong_t __user *u = ubuf;
+       compat_ulong_t reg;
+       int i;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       ckpt_regs = &target->thread.ckpt_regs.gpr[0];
+#endif
+       if (tm_active) {
+               if (!ckpt_regs)
+                       return -ENODEV;
+               regs = ckpt_regs;
+       } else {
+               if (target->thread.regs == NULL)
+                       return -EIO;
+
+               /* only form the pointer once thread.regs is known valid */
+               regs = &target->thread.regs->gpr[0];
+
+               if (!FULL_REGS(target->thread.regs)) {
+                       /*
+                        * We have a partial register set.
+                        * Fill 14-31 with bogus values.
+                        */
+                       for (i = 14; i < 32; i++)
+                               target->thread.regs->gpr[i] = NV_REG_POISON;
+               }
+       }
+
+       /* work in units of compat words from here on */
+       pos /= sizeof(reg);
+       count /= sizeof(reg);
+
+       if (kbuf)
+               for (; count > 0 && pos < PT_MSR; --count)
+                       *k++ = regs[pos++];
+       else
+               for (; count > 0 && pos < PT_MSR; --count)
+                       if (__put_user((compat_ulong_t) regs[pos++], u++))
+                               return -EFAULT;
+
+       /* the MSR slot is reported via get_user_msr(), not the raw array */
+       if (count > 0 && pos == PT_MSR) {
+               reg = get_user_msr(target);
+               if (kbuf)
+                       *k++ = reg;
+               else if (__put_user(reg, u++))
+                       return -EFAULT;
+               ++pos;
+               --count;
+       }
+
+       if (kbuf)
+               for (; count > 0 && pos < PT_REGS_COUNT; --count)
+                       *k++ = regs[pos++];
+       else
+               for (; count > 0 && pos < PT_REGS_COUNT; --count)
+                       if (__put_user((compat_ulong_t) regs[pos++], u++))
+                               return -EFAULT;
+
+       /* back to byte units; zero-fill whatever remains of the request */
+       kbuf = k;
+       ubuf = u;
+       pos *= sizeof(reg);
+       count *= sizeof(reg);
+       return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+                                       PT_REGS_COUNT * sizeof(reg), -1);
+}
+
+static int gpr32_set_common(struct task_struct *target,
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,
+                    const void *kbuf, const void __user *ubuf, bool tm_active)
+{
+       unsigned long *regs = &target->thread.regs->gpr[0];
+       unsigned long *ckpt_regs;
+       const compat_ulong_t *k = kbuf;
+       const compat_ulong_t __user *u = ubuf;
+       compat_ulong_t reg;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       ckpt_regs = &target->thread.ckpt_regs.gpr[0];
+#endif
+
+       if (tm_active) {
+               regs = ckpt_regs;
+       } else {
+               regs = &target->thread.regs->gpr[0];
+
+               if (target->thread.regs == NULL)
+                       return -EIO;
+
+               CHECK_FULL_REGS(target->thread.regs);
+       }
+
+       pos /= sizeof(reg);
+       count /= sizeof(reg);
+
+       if (kbuf)
+               for (; count > 0 && pos < PT_MSR; --count)
                         regs[pos++] = *k++;
         else
                 for (; count > 0 && pos < PT_MSR; --count) {
@@ -804,6 +2195,40 @@ static int gpr32_set(struct task_struct *target,
                                          (PT_TRAP + 1) * sizeof(reg), -1);
  }
  
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Compat (32-bit) "get" handler for the TM checkpointed GPR regset.
+ * Forwards to gpr32_get_common() with its final flag argument set
+ * (presumably tm_active, as in gpr32_set_common() - confirm).
+ */
+static int tm_cgpr32_get(struct task_struct *target,
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,
+                    void *kbuf, void __user *ubuf)
+{
+       return gpr32_get_common(target, regset, pos, count, kbuf, ubuf,
+                               true);
+}
+
+/*
+ * Compat (32-bit) "set" handler for the TM checkpointed GPR regset.
+ * Calls gpr32_set_common() with tm_active set, so the write targets
+ * thread.ckpt_regs rather than the live thread.regs.
+ */
+static int tm_cgpr32_set(struct task_struct *target,
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,
+                    const void *kbuf, const void __user *ubuf)
+{
+       return gpr32_set_common(target, regset, pos, count, kbuf, ubuf,
+                               true);
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+/*
+ * Compat (32-bit) "get" handler for the regular GPR regset.
+ * Forwards to gpr32_get_common() with its final flag argument clear
+ * (presumably tm_active, as in gpr32_set_common() - confirm).
+ */
+static int gpr32_get(struct task_struct *target,
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,
+                    void *kbuf, void __user *ubuf)
+{
+       return gpr32_get_common(target, regset, pos, count, kbuf, ubuf,
+                               false);
+}
+
+/*
+ * Compat (32-bit) "set" handler for the regular GPR regset.
+ * Calls gpr32_set_common() with tm_active clear, so the write targets the
+ * live thread.regs (the common helper returns -EIO if those are NULL).
+ */
+static int gpr32_set(struct task_struct *target,
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,
+                    const void *kbuf, const void __user *ubuf)
+{
+       return gpr32_set_common(target, regset, pos, count, kbuf, ubuf,
+                               false);
+}
+
  /*
   * These are the regset flavors matching the CONFIG_PPC32 native set.
   */
@@ -832,6 +2257,73 @@ static const struct user_regset compat_regsets[] = {
                 .active = evr_active, .get = evr_get, .set = evr_set
         },
  #endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       [REGSET_TM_CGPR] = {
+               .core_note_type = NT_PPC_TM_CGPR, .n = ELF_NGREG,
+               .size = sizeof(long), .align = sizeof(long),
+               .active = tm_cgpr_active,
+               .get = tm_cgpr32_get, .set = tm_cgpr32_set
+       },
+       [REGSET_TM_CFPR] = {
+               .core_note_type = NT_PPC_TM_CFPR, .n = ELF_NFPREG,
+               .size = sizeof(double), .align = sizeof(double),
+               .active = tm_cfpr_active, .get = tm_cfpr_get, .set = tm_cfpr_set
+       },
+       [REGSET_TM_CVMX] = {
+               .core_note_type = NT_PPC_TM_CVMX, .n = ELF_NVMX,
+               .size = sizeof(vector128), .align = sizeof(vector128),
+               .active = tm_cvmx_active, .get = tm_cvmx_get, .set = tm_cvmx_set
+       },
+       [REGSET_TM_CVSX] = {
+               .core_note_type = NT_PPC_TM_CVSX, .n = ELF_NVSX,
+               .size = sizeof(double), .align = sizeof(double),
+               .active = tm_cvsx_active, .get = tm_cvsx_get, .set = tm_cvsx_set
+       },
+       [REGSET_TM_SPR] = {
+               .core_note_type = NT_PPC_TM_SPR, .n = ELF_NTMSPRREG,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .active = tm_spr_active, .get = tm_spr_get, .set = tm_spr_set
+       },
+       [REGSET_TM_CTAR] = {
+               .core_note_type = NT_PPC_TM_CTAR, .n = 1,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .active = tm_tar_active, .get = tm_tar_get, .set = tm_tar_set
+       },
+       [REGSET_TM_CPPR] = {
+               .core_note_type = NT_PPC_TM_CPPR, .n = 1,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .active = tm_ppr_active, .get = tm_ppr_get, .set = tm_ppr_set
+       },
+       [REGSET_TM_CDSCR] = {
+               .core_note_type = NT_PPC_TM_CDSCR, .n = 1,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .active = tm_dscr_active, .get = tm_dscr_get, .set = tm_dscr_set
+       },
+#endif
+#ifdef CONFIG_PPC64
+       [REGSET_PPR] = {
+               .core_note_type = NT_PPC_PPR, .n = 1,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .get = ppr_get, .set = ppr_set
+       },
+       [REGSET_DSCR] = {
+               .core_note_type = NT_PPC_DSCR, .n = 1,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .get = dscr_get, .set = dscr_set
+       },
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+       [REGSET_TAR] = {
+               .core_note_type = NT_PPC_TAR, .n = 1,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .get = tar_get, .set = tar_set
+       },
+       [REGSET_EBB] = {
+               .core_note_type = NT_PPC_EBB, .n = ELF_NEBB,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .active = ebb_active, .get = ebb_get, .set = ebb_set
+       },
+#endif
  };
  
  static const struct user_regset_view user_ppc_compat_view = {
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c

index 714b4ba7ab86e637dbc695879e372ea32be06c68..dba265c586df010fea218aa2c3d54102c5dbb708 100644 (file)
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -66,6 +66,7 @@
  #include <asm/hugetlb.h>
  #include <asm/livepatch.h>
  #include <asm/mmu_context.h>
+#include <asm/cpu_has_feature.h>
  
  #include "setup.h"
  
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c

index 00f57754407ec2bb56177dc0d77619e7214a78ff..c3e861df4b203ce5be8e0e9f0fb8f4e236d41cd5 100644 (file)
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -37,6 +37,7 @@
  #include <asm/serial.h>
  #include <asm/udbg.h>
  #include <asm/code-patching.h>
+#include <asm/cpu_has_feature.h>
  
  #define DBG(fmt...)
  
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c

index d8216aed22b755a6b42873ded30e5cf2b4477ea8..eafb9a79e0116b600624a16212c2a02bfb46e363 100644 (file)
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -227,8 +227,8 @@ static void __init configure_exceptions(void)
                         opal_configure_cores();
  
                 /* Enable AIL if supported, and we are in hypervisor mode */
-               if (cpu_has_feature(CPU_FTR_HVMODE) &&
-                   cpu_has_feature(CPU_FTR_ARCH_207S)) {
+               if (early_cpu_has_feature(CPU_FTR_HVMODE) &&
+                   early_cpu_has_feature(CPU_FTR_ARCH_207S)) {
                         unsigned long lpcr = mfspr(SPRN_LPCR);
                         mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
                 }
@@ -298,12 +298,12 @@ void __init early_setup(unsigned long dt_ptr)
          */
         configure_exceptions();
  
-       /* Initialize the hash table or TLB handling */
-       early_init_mmu();
-
         /* Apply all the dynamic patching */
         apply_feature_fixups();
  
+       /* Initialize the hash table or TLB handling */
+       early_init_mmu();
+
         /*
          * At this point, we can let interrupts switch to virtual mode
          * (the MMU has been setup), so adjust the MSR in the PACA to
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c

index 5a1f015ea9f36bfea7e55040699eb2acfa237269..25a39052bf6b5653c3ab0317298864a2873fd911 100644 (file)
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -55,6 +55,7 @@
  #include <asm/debug.h>
  #include <asm/kexec.h>
  #include <asm/asm-prototypes.h>
+#include <asm/cpu_has_feature.h>
  
  #ifdef DEBUG
  #include <asm/udbg.h>
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c

index 4e7759c8ca308427d81c7d4a9ec9b29e22855178..3efbedefba6a9fdf41095cd84b599f2d7e8b7748 100644 (file)
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -56,6 +56,7 @@
  #include <linux/irq_work.h>
  #include <linux/clk-provider.h>
  #include <linux/suspend.h>
+#include <linux/rtc.h>
  #include <asm/trace.h>
  
  #include <asm/io.h>
@@ -1159,6 +1160,29 @@ void calibrate_delay(void)
         loops_per_jiffy = tb_ticks_per_jiffy;
  }
  
+#if IS_ENABLED(CONFIG_RTC_DRV_GENERIC)
+/*
+ * read_time hook of rtc_generic_ops: fill @tm through the platform's
+ * ppc_md.get_rtc_time() callback and return what rtc_valid_tm() says
+ * about the result. @dev is unused.
+ */
+static int rtc_generic_get_time(struct device *dev, struct rtc_time *tm)
+{
+       int ret;
+
+       ppc_md.get_rtc_time(tm);
+       ret = rtc_valid_tm(tm);
+
+       return ret;
+}
+
+/*
+ * set_time hook of rtc_generic_ops: hand @tm to the platform's
+ * ppc_md.set_rtc_time() callback. Returns 0 on success and -EOPNOTSUPP
+ * when the callback is missing or reports a negative value.
+ */
+static int rtc_generic_set_time(struct device *dev, struct rtc_time *tm)
+{
+       /* A missing callback and a failing one are reported alike. */
+       if (!ppc_md.set_rtc_time || ppc_md.set_rtc_time(tm) < 0)
+               return -EOPNOTSUPP;
+
+       return 0;
+}
+
+/*
+ * RTC class operations backed by the ppc_md get/set_rtc_time callbacks;
+ * passed as platform data to the "rtc-generic" device in rtc_init().
+ */
+static const struct rtc_class_ops rtc_generic_ops = {
+       .read_time = rtc_generic_get_time,
+       .set_time = rtc_generic_set_time,
+};
+
  static int __init rtc_init(void)
  {
         struct platform_device *pdev;
@@ -1166,9 +1190,12 @@ static int __init rtc_init(void)
         if (!ppc_md.get_rtc_time)
                 return -ENODEV;
  
-       pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
+       pdev = platform_device_register_data(NULL, "rtc-generic", -1,
+                                            &rtc_generic_ops,
+                                            sizeof(rtc_generic_ops));
  
         return PTR_ERR_OR_ZERO(pdev);
  }
  
  device_initcall(rtc_init);
+#endif
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c

index defb2998b8183a9588c76bc10a1f575ee28aa46d..74145f02ad417b496ceba07b0f114a2bbd77bc75 100644 (file)
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -13,6 +13,7 @@
   */
  
  #include <linux/types.h>
+#include <linux/jump_label.h>
  #include <linux/kernel.h>
  #include <linux/string.h>
  #include <linux/init.h>
@@ -152,9 +153,18 @@ static void do_final_fixups(void)
  #endif
  }
  
-void apply_feature_fixups(void)
+static unsigned long __initdata saved_cpu_features;
+static unsigned int __initdata saved_mmu_features;
+#ifdef CONFIG_PPC64
+static unsigned long __initdata saved_firmware_features;
+#endif
+
+void __init apply_feature_fixups(void)
  {
-       struct cpu_spec *spec = *PTRRELOC(&cur_cpu_spec);
+       struct cpu_spec *spec = PTRRELOC(*PTRRELOC(&cur_cpu_spec));
+
+       *PTRRELOC(&saved_cpu_features) = spec->cpu_features;
+       *PTRRELOC(&saved_mmu_features) = spec->mmu_features;
  
         /*
          * Apply the CPU-specific and firmware specific fixups to kernel text
@@ -173,11 +183,36 @@ void apply_feature_fixups(void)
                          PTRRELOC(&__stop___lwsync_fixup));
  
  #ifdef CONFIG_PPC64
+       saved_firmware_features = powerpc_firmware_features;
         do_feature_fixups(powerpc_firmware_features,
                           &__start___fw_ftr_fixup, &__stop___fw_ftr_fixup);
  #endif
         do_final_fixups();
+
+       /*
+        * Initialise jump label. This causes all the cpu/mmu_has_feature()
+        * checks to take on their correct polarity based on the current set of
+        * CPU/MMU features.
+        */
+       jump_label_init();
+       cpu_feature_keys_init();
+       mmu_feature_keys_init();
+}
+
+/*
+ * Late sanity check: warn if the CPU, MMU or (on PPC64) firmware feature
+ * masks no longer match the values recorded by apply_feature_fixups().
+ * Always returns 0.
+ */
+static int __init check_features(void)
+{
+       const struct cpu_spec *spec = cur_cpu_spec;
+
+       WARN(spec->cpu_features != saved_cpu_features,
+            "CPU features changed after feature patching!\n");
+       WARN(spec->mmu_features != saved_mmu_features,
+            "MMU features changed after feature patching!\n");
+#ifdef CONFIG_PPC64
+       WARN(powerpc_firmware_features != saved_firmware_features,
+            "Firmware features changed after feature patching!\n");
+#endif
+
+       return 0;
  }
+late_initcall(check_features);
  
  #ifdef CONFIG_FTR_FIXUP_SELFTEST
  
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c

index 88ce7d21232090fe1d16d91bffd41eda311716f3..0e4e9654bd2c1fdc07e265d3a7c35b1ab2917dff 100644 (file)
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -72,8 +72,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
                 /* clear out bits after (52) [0....52.....63] */
                 va &= ~((1ul << (64 - 52)) - 1);
                 va |= ssize << 8;
-               sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
-                       ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+               sllp = get_sllp_encoding(apsize);
                 va |= sllp << 5;
                 asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
                              : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
@@ -122,8 +121,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
                 /* clear out bits after(52) [0....52.....63] */
                 va &= ~((1ul << (64 - 52)) - 1);
                 va |= ssize << 8;
-               sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
-                       ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+               sllp = get_sllp_encoding(apsize);
                 va |= sllp << 5;
                 asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
                              : : "r"(va) : "memory");
@@ -749,5 +747,5 @@ void __init hpte_init_native(void)
         mmu_hash_ops.hugepage_invalidate   = native_hugepage_invalidate;
  
         if (cpu_has_feature(CPU_FTR_ARCH_300))
-               ppc_md.register_process_table = native_register_proc_table;
+               register_process_table = native_register_proc_table;
  }
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c

index b78b5d21127864e2678d3b9e69aaff4cbf6cb0c0..0821556e16f4b22e8db28071e8a429499aa5fa87 100644 (file)
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -363,11 +363,6 @@ static int __init htab_dt_scan_seg_sizes(unsigned long node,
         return 0;
  }
  
-static void __init htab_init_seg_sizes(void)
-{
-       of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
-}
-
  static int __init get_idx_from_shift(unsigned int shift)
  {
         int idx = -1;
@@ -539,7 +534,7 @@ static bool might_have_hea(void)
  
  #endif /* #ifdef CONFIG_PPC_64K_PAGES */
  
-static void __init htab_init_page_sizes(void)
+static void __init htab_scan_page_sizes(void)
  {
         int rc;
  
@@ -554,17 +549,23 @@ static void __init htab_init_page_sizes(void)
          * Try to find the available page sizes in the device-tree
          */
         rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL);
-       if (rc != 0)  /* Found */
-               goto found;
-
-       /*
-        * Not in the device-tree, let's fallback on known size
-        * list for 16M capable GP & GR
-        */
-       if (mmu_has_feature(MMU_FTR_16M_PAGE))
+       if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) {
+               /*
+                * Nothing in the device-tree, but the CPU supports 16M pages,
+                * so let's fallback on a known size list for 16M capable CPUs.
+                */
                 memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
                        sizeof(mmu_psize_defaults_gp));
-found:
+       }
+
+#ifdef CONFIG_HUGETLB_PAGE
+       /* Reserve 16G huge page memory sections for huge pages */
+       of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
+#endif /* CONFIG_HUGETLB_PAGE */
+}
+
+static void __init htab_init_page_sizes(void)
+{
         if (!debug_pagealloc_enabled()) {
                 /*
                  * Pick a size for the linear mapping. Currently, we only
@@ -630,11 +631,6 @@ found:
                ,mmu_psize_defs[mmu_vmemmap_psize].shift
  #endif
                );
-
-#ifdef CONFIG_HUGETLB_PAGE
-       /* Reserve 16G huge page memory sections for huge pages */
-       of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
-#endif /* CONFIG_HUGETLB_PAGE */
  }
  
  static int __init htab_dt_scan_pftsize(unsigned long node,
@@ -759,12 +755,6 @@ static void __init htab_initialize(void)
  
         DBG(" -> htab_initialize()\n");
  
-       /* Initialize segment sizes */
-       htab_init_seg_sizes();
-
-       /* Initialize page sizes */
-       htab_init_page_sizes();
-
         if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
                 mmu_kernel_ssize = MMU_SEGSIZE_1T;
                 mmu_highuser_ssize = MMU_SEGSIZE_1T;
@@ -885,8 +875,19 @@ static void __init htab_initialize(void)
  #undef KB
  #undef MB
  
+/*
+ * Early device-tree pass for the hash MMU: scans the flattened device tree
+ * for segment sizes and then for page sizes. Called from
+ * mmu_early_init_devtree() when radix is not enabled.
+ */
+void __init hash__early_init_devtree(void)
+{
+       /* Initialize segment sizes */
+       of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
+
+       /* Initialize page sizes */
+       htab_scan_page_sizes();
+}
+
  void __init hash__early_init_mmu(void)
  {
+       htab_init_page_sizes();
+
         /*
          * initialize page table size
          */
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c

index 1e11559e1aac274192dd117e42253b189e6e8e90..35254a6784561b6f5f70399822cb163f8e59b14d 100644 (file)
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -5,39 +5,34 @@
  #include <asm/cacheflush.h>
  #include <asm/machdep.h>
  #include <asm/mman.h>
+#include <asm/tlb.h>
  
  void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
  {
-       unsigned long ap, shift;
+       int psize;
         struct hstate *hstate = hstate_file(vma->vm_file);
  
-       shift = huge_page_shift(hstate);
-       if (shift == mmu_psize_defs[MMU_PAGE_2M].shift)
-               ap = mmu_get_ap(MMU_PAGE_2M);
-       else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift)
-               ap = mmu_get_ap(MMU_PAGE_1G);
-       else {
-               WARN(1, "Wrong huge page shift\n");
-               return ;
-       }
-       radix___flush_tlb_page(vma->vm_mm, vmaddr, ap, 0);
+       psize = hstate_get_psize(hstate);
+       radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
  }
  
  void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
  {
-       unsigned long ap, shift;
+       int psize;
         struct hstate *hstate = hstate_file(vma->vm_file);
  
-       shift = huge_page_shift(hstate);
-       if (shift == mmu_psize_defs[MMU_PAGE_2M].shift)
-               ap = mmu_get_ap(MMU_PAGE_2M);
-       else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift)
-               ap = mmu_get_ap(MMU_PAGE_1G);
-       else {
-               WARN(1, "Wrong huge page shift\n");
-               return ;
-       }
-       radix___local_flush_tlb_page(vma->vm_mm, vmaddr, ap, 0);
+       psize = hstate_get_psize(hstate);
+       radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
+}
+
+/*
+ * Flush the TLB for [start, end) of a hugetlb VMA, using the MMU page size
+ * index that hstate_get_psize() reports for the VMA's backing file.
+ */
+void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start,
+                                  unsigned long end)
+{
+       struct hstate *h = hstate_file(vma->vm_file);
+       int mmu_psize = hstate_get_psize(h);
+
+       radix__flush_tlb_range_psize(vma->vm_mm, start, end, mmu_psize);
  }
  
  /*
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c

index 33709bdb04196ae3dfac4cb3f7463db2210a3449..16ada1eb7e26393cc4663169ea51e3dc85b0d369 100644 (file)
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -411,3 +411,25 @@ struct page *realmode_pfn_to_page(unsigned long pfn)
  EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
  
  #endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */
+
+#ifdef CONFIG_PPC_STD_MMU_64
+/* Set when "disable_radix" is present on the kernel command line. */
+static bool disable_radix;
+
+static int __init parse_disable_radix(char *p)
+{
+       /* Presence of the option is all that matters; @p is not examined. */
+       disable_radix = true;
+       return 0;
+}
+early_param("disable_radix", parse_disable_radix);
+
+/*
+ * Select the MMU flavour and run its early device-tree setup: radix when
+ * early_radix_enabled() says so, hash otherwise.
+ */
+void __init mmu_early_init_devtree(void)
+{
+       /* Honour the command line before testing for radix below. */
+       if (disable_radix)
+               cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+
+       if (!early_radix_enabled()) {
+               hash__early_init_devtree();
+               return;
+       }
+
+       radix__early_init_devtree();
+}
+#endif /* CONFIG_PPC_STD_MMU_64 */
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c

index 670318766545c6fb029e4acfb7711be5ca7fe90d..34079302cc17488e151c96c121247d2bfb0f1a1f 100644 (file)
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -14,6 +14,9 @@
  #include "mmu_decl.h"
  #include <trace/events/thp.h>
  
+/*
+ * Hook used to register the process table. It is assigned by
+ * hpte_init_native() (on CPU_FTR_ARCH_300) and by radix_init_native(), and
+ * invoked from pgtable-radix.c with the physical address of the table.
+ */
+int (*register_process_table)(unsigned long base, unsigned long page_size,
+                             unsigned long tbl_size);
+
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  /*
   * This is called when relaxing access to a hugepage. It's also called in the page
@@ -33,7 +36,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
         changed = !pmd_same(*(pmdp), entry);
         if (changed) {
                 __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
-               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+               flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
         }
         return changed;
  }
@@ -66,7 +69,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                      pmd_t *pmdp)
  {
         pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
-       flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+       flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
         /*
          * This ensures that generic code that rely on IRQ disabling
          * to prevent a parallel THP split work as expected.
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c

index 003ff48a11b6212888fac831ad8ab7057d2e84cd..af897d91d09f4929c06b09aadf5b67d829f13ea9 100644 (file)
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -171,7 +171,7 @@ redo:
          * of process table here. But our linear mapping also enable us to use
          * physical address here.
          */
-       ppc_md.register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
+       register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
         pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
  }
  
@@ -198,7 +198,7 @@ static void __init radix_init_partition_table(void)
  
  void __init radix_init_native(void)
  {
-       ppc_md.register_process_table = native_register_process_table;
+       register_process_table = native_register_process_table;
  }
  
  static int __init get_idx_from_shift(unsigned int shift)
@@ -264,7 +264,7 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
         return 1;
  }
  
-static void __init radix_init_page_sizes(void)
+void __init radix__early_init_devtree(void)
  {
         int rc;
  
@@ -343,7 +343,6 @@ void __init radix__early_init_mmu(void)
         __pte_frag_nr = H_PTE_FRAG_NR;
         __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
  
-       radix_init_page_sizes();
         if (!firmware_has_feature(FW_FEATURE_LPAR)) {
                 radix_init_native();
                 lpcr = mfspr(SPRN_LPCR);
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c

index 88a307504b5a0f3798be84c282ab09de47168452..0b6fb244d0a16a4c4e40f7e3f8d15aad83223a3f 100644 (file)
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -225,7 +225,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                 if (!is_vm_hugetlb_page(vma))
                         assert_pte_locked(vma->vm_mm, address);
                 __ptep_set_access_flags(ptep, entry);
-               flush_tlb_page_nohash(vma, address);
+               flush_tlb_page(vma, address);
         }
         return changed;
  }
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c

index e1f22700fb169941b869c1c085c7b284df9d078a..48df05ef523100e9aa67eb29fadf8b22cd116daf 100644 (file)
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -140,10 +140,11 @@ void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
  }
  EXPORT_SYMBOL(radix__local_flush_tlb_pwc);
  
-void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
-                           unsigned long ap, int nid)
+void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
+                                      int psize)
  {
         unsigned long pid;
+       unsigned long ap = mmu_get_ap(psize);
  
         preempt_disable();
         pid = mm ? mm->context.id : 0;
@@ -159,18 +160,12 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
         if (vma && is_vm_hugetlb_page(vma))
                 return __local_flush_hugetlb_page(vma, vmaddr);
  #endif
-       radix___local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
-                              mmu_get_ap(mmu_virtual_psize), 0);
+       radix__local_flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr,
+                                         mmu_virtual_psize);
  }
  EXPORT_SYMBOL(radix__local_flush_tlb_page);
  
  #ifdef CONFIG_SMP
-static int mm_is_core_local(struct mm_struct *mm)
-{
-       return cpumask_subset(mm_cpumask(mm),
-                             topology_sibling_cpumask(smp_processor_id()));
-}
-
  void radix__flush_tlb_mm(struct mm_struct *mm)
  {
         unsigned long pid;
@@ -221,10 +216,11 @@ no_context:
  }
  EXPORT_SYMBOL(radix__flush_tlb_pwc);
  
-void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
-                      unsigned long ap, int nid)
+void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
+                                int psize)
  {
         unsigned long pid;
+       unsigned long ap = mmu_get_ap(psize);
  
         preempt_disable();
         pid = mm ? mm->context.id : 0;
@@ -250,8 +246,8 @@ void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
         if (vma && is_vm_hugetlb_page(vma))
                 return flush_hugetlb_page(vma, vmaddr);
  #endif
-       radix___flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
-                        mmu_get_ap(mmu_virtual_psize), 0);
+       radix__flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr,
+                                   mmu_virtual_psize);
  }
  EXPORT_SYMBOL(radix__flush_tlb_page);
  
@@ -299,8 +295,65 @@ static int radix_get_mmu_psize(int page_size)
  
  void radix__tlb_flush(struct mmu_gather *tlb)
  {
+       int psize = 0;
         struct mm_struct *mm = tlb->mm;
-       radix__flush_tlb_mm(mm);
+       int page_size = tlb->page_size;
+
+       psize = radix_get_mmu_psize(page_size);
+       /*
+        * if page size is not something we understand, do a full mm flush
+        */
+       if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all)
+               radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize);
+       else
+               radix__flush_tlb_mm(mm);
+}
+
+/* Sentinel value of "end" requesting a flush of the whole address space. */
+#define TLB_FLUSH_ALL (-1UL)
+
+/*
+ * Number of pages above which a ranged flush is replaced by one flush of
+ * the whole PID. Just a number at this point, copied from x86.
+ */
+static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+
+/*
+ * Invalidate the translations of page size @psize that cover
+ * [@start, @end) for @mm.
+ *
+ * A NULL @mm selects PID 0; a PID of MMU_NO_CONTEXT makes this a no-op.
+ * If @end is TLB_FLUSH_ALL, or the range spans more than
+ * tlb_single_page_flush_ceiling pages of this size, the whole PID is
+ * flushed instead of walking the range page by page.
+ */
+void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
+                                 unsigned long end, int psize)
+{
+       unsigned long pid;
+       unsigned long addr;
+       int local;
+       unsigned long ap = mmu_get_ap(psize);
+       int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+       unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
+
+       preempt_disable();
+       pid = mm ? mm->context.id : 0;
+       if (unlikely(pid == MMU_NO_CONTEXT))
+               goto err_out;
+
+       /*
+        * Choose between the local (_tlbiel_*) and global (_tlbie_*) flush
+        * only once preemption is off: the answer depends on the CPU we
+        * are running on, so it must not be sampled before a possible
+        * migration.
+        * NOTE(review): unlike the pid lookup above, this does not cope
+        * with a NULL mm - confirm that no caller passes one.
+        */
+       local = mm_is_core_local(mm);
+
+       if (end == TLB_FLUSH_ALL ||
+           (end - start) > tlb_single_page_flush_ceiling * page_size) {
+               if (local)
+                       _tlbiel_pid(pid, RIC_FLUSH_TLB);
+               else
+                       _tlbie_pid(pid, RIC_FLUSH_TLB);
+               goto err_out;
+       }
+
+       for (addr = start; addr < end; addr += page_size) {
+               if (local)
+                       _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+               else {
+                       /* Serialise tlbie unless the MMU can do without. */
+                       if (lock_tlbie)
+                               raw_spin_lock(&native_tlbie_lock);
+                       _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+                       if (lock_tlbie)
+                               raw_spin_unlock(&native_tlbie_lock);
+               }
+       }
+err_out:
+       preempt_enable();
  }
  
  void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa,
@@ -340,3 +393,10 @@ void radix__flush_tlb_lpid(unsigned long lpid)
         asm volatile("eieio; tlbsync; ptesync": : :"memory");
  }
  EXPORT_SYMBOL(radix__flush_tlb_lpid);
+
+/*
+ * Flush [start, end) of @vma's address space as MMU_PAGE_2M sized
+ * translations, via radix__flush_tlb_range_psize().
+ */
+void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
+                               unsigned long start, unsigned long end)
+{
+       struct mm_struct *mm = vma->vm_mm;
+
+       radix__flush_tlb_range_psize(mm, start, end, MMU_PAGE_2M);
+}
+EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c

index 558e30cce33eed7017fbe3066e1868dd40a95db3..702d7689d714e2173353a7df38c9899e66854351 100644 (file)
--- a/arch/powerpc/mm/tlb_hash32.c
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -48,17 +48,6 @@ void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
  }
  EXPORT_SYMBOL(flush_hash_entry);
  
-/*
- * Called by ptep_set_access_flags, must flush on CPUs for which the
- * DSI handler can't just "fixup" the TLB on a write fault
- */
-void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr)
-{
-       if (Hash != 0)
-               return;
-       _tlbie(addr);
-}
-
  /*
   * Called at the end of a mmu_gather operation to make sure the
   * TLB flush is completely done.
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c

index f4668488512c46d278cc1742672e741a9016639b..050badc0ebd3446f22677aea4d9e516d9b3801bd 100644 (file)
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -215,12 +215,6 @@ EXPORT_SYMBOL(local_flush_tlb_page);
  
  static DEFINE_RAW_SPINLOCK(tlbivax_lock);
  
-static int mm_is_core_local(struct mm_struct *mm)
-{
-       return cpumask_subset(mm_cpumask(mm),
-                             topology_sibling_cpumask(smp_processor_id()));
-}
-
  struct tlb_flush_param {
         unsigned long addr;
         unsigned int pid;
diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h

index cda6fcb809ca25a9d10d9968c6798d16564bbd14..6447dc1c3d896cea18615d3b5bacc4bb6285fbb1 100644 (file)
--- a/arch/powerpc/perf/power9-events-list.h
+++ b/arch/powerpc/perf/power9-events-list.h
@@ -34,15 +34,15 @@ EVENT(PM_L1_ICACHE_MISS,                    0x200fd)
  /* Instruction Demand sectors written into IL1 */
  EVENT(PM_L1_DEMAND_WRITE,                      0x0408c)
  /* Instruction prefetch written into IL1 */
-EVENT(PM_IC_PREF_WRITE,                                0x0408e)
+EVENT(PM_IC_PREF_WRITE,                                0x0488c)
  /* The data cache was reloaded from local core's L3 due to a demand load */
  EVENT(PM_DATA_FROM_L3,                         0x4c042)
  /* Demand LD - L3 Miss (not L2 hit and not L3 hit) */
  EVENT(PM_DATA_FROM_L3MISS,                     0x300fe)
  /* All successful D-side store dispatches for this thread */
-EVENT(PM_L2_ST,                                        0x16081)
+EVENT(PM_L2_ST,                                        0x16880)
  /* All successful D-side store dispatches for this thread that were L2 Miss */
-EVENT(PM_L2_ST_MISS,                           0x26081)
+EVENT(PM_L2_ST_MISS,                           0x26880)
  /* Total HW L3 prefetches(Load+store) */
  EVENT(PM_L3_PREF_ALL,                          0x4e052)
  /* Data PTEG reload */
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig

index 3663f71fd913badd0f9c084be762902c547382e4..fbdae8377b71452cf2afb8b2e24e36a5d9ccd9b0 100644 (file)
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -321,6 +321,17 @@ config OF_RTC
           Uses information from the OF or flattened device tree to instantiate
           platform devices for direct mapped RTC chips like the DS1742 or DS1743.
  
+config GEN_RTC
+       bool "Use the platform RTC operations from user space"
+       select RTC_CLASS
+       select RTC_DRV_GENERIC
+       help
+         This option provides backwards compatibility with the old gen_rtc.ko
+         module that was traditionally used for old PowerPC machines.
+         Platforms should migrate to enabling the RTC_DRV_GENERIC by hand
+         replacing their get_rtc_time/set_rtc_time callbacks with
+         a proper RTC device driver.
+
  config SIMPLE_GPIO
         bool "Support for simple, memory-mapped GPIO controllers"
         depends on PPC
diff --git a/arch/powerpc/platforms/cell/pervasive.c b/arch/powerpc/platforms/cell/pervasive.c

index d17e98bc0c10d3ff0469cfa8af873f08d57abded..e7d075077cb045399cb90a65547cb2a13649a86f 100644 (file)
--- a/arch/powerpc/platforms/cell/pervasive.c
+++ b/arch/powerpc/platforms/cell/pervasive.c
@@ -35,6 +35,7 @@
  #include <asm/pgtable.h>
  #include <asm/reg.h>
  #include <asm/cell-regs.h>
+#include <asm/cpu_has_feature.h>
  
  #include "pervasive.h"
  
diff --git a/arch/powerpc/platforms/ps3/time.c b/arch/powerpc/platforms/ps3/time.c

index 791c6142c4a7bd0185437890350381c529779a54..11b45b58c81bd77956d4fa1f0afd2a3357dafd8f 100644 (file)
--- a/arch/powerpc/platforms/ps3/time.c
+++ b/arch/powerpc/platforms/ps3/time.c
@@ -20,9 +20,9 @@
  
  #include <linux/kernel.h>
  #include <linux/platform_device.h>
+#include <linux/rtc.h>
  
  #include <asm/firmware.h>
-#include <asm/rtc.h>
  #include <asm/lv1call.h>
  #include <asm/ps3.h>
  
diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c

index 984e816f3fafaaf7e40dfa85a5a45462a70c23ae..68e7c0dd2e45551143b6afc079fd185d2ca89a80 100644 (file)
--- a/arch/powerpc/sysdev/fsl_rio.c
+++ b/arch/powerpc/sysdev/fsl_rio.c
@@ -491,6 +491,7 @@ int fsl_rio_setup(struct platform_device *dev)
         rmu_node = of_parse_phandle(dev->dev.of_node, "fsl,srio-rmu-handle", 0);
         if (!rmu_node) {
                 dev_err(&dev->dev, "No valid fsl,srio-rmu-handle property\n");
+               rc = -ENOENT;
                 goto err_rmu;
         }
         rc = of_address_to_resource(rmu_node, 0, &rmu_regs);
diff --git a/arch/powerpc/xmon/ppc-dis.c b/arch/powerpc/xmon/ppc-dis.c

index 89098f320ad570577328ff42ba4e426a5d22d37f..ee98917341496905876dadc6e51cdfd4b9ced815 100644 (file)
--- a/arch/powerpc/xmon/ppc-dis.c
+++ b/arch/powerpc/xmon/ppc-dis.c
@@ -20,6 +20,7 @@ along with this file; see the file COPYING.  If not, write to the Free
  Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
  
  #include <asm/cputable.h>
+#include <asm/cpu_has_feature.h>
  #include "nonstdio.h"
  #include "ansidecl.h"
  #include "ppc.h"
diff --git a/arch/sh/include/asm/mc146818rtc.h b/arch/sh/include/asm/mc146818rtc.h

deleted file mode 100644 (file)

index 0aee96a..0000000
--- a/arch/sh/include/asm/mc146818rtc.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/*
- * Machine dependent access functions for RTC registers.
- */
-#ifndef _ASM_MC146818RTC_H
-#define _ASM_MC146818RTC_H
-
-#endif /* _ASM_MC146818RTC_H */
diff --git a/arch/sh/include/asm/rtc.h b/arch/sh/include/asm/rtc.h

index 52b0c2dba979c8a40c6747bb42e7720ba00d66df..f7b010d48af7b17d8c6dbf475d105f7839f816a5 100644 (file)
--- a/arch/sh/include/asm/rtc.h
+++ b/arch/sh/include/asm/rtc.h
@@ -6,17 +6,6 @@ extern void (*board_time_init)(void);
  extern void (*rtc_sh_get_time)(struct timespec *);
  extern int (*rtc_sh_set_time)(const time_t);
  
-/* some dummy definitions */
-#define RTC_BATT_BAD 0x100     /* battery bad */
-#define RTC_SQWE 0x08          /* enable square-wave output */
-#define RTC_DM_BINARY 0x04     /* all time/date values are BCD if clear */
-#define RTC_24H 0x02           /* 24 hour mode - else hours bit 7 means pm */
-#define RTC_DST_EN 0x01                /* auto switch DST - works f. USA only */
-
-struct rtc_time;
-unsigned int get_rtc_time(struct rtc_time *);
-int set_rtc_time(struct rtc_time *);
-
  #define RTC_CAP_4_DIGIT_YEAR   (1 << 0)
  
  struct sh_rtc_platform_info {
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c

index d6d0a986c6e937680186a32bbfd602c5ff40d10e..a4a7862b489a70ac7fe1b405a4586252e6b90ae1 100644 (file)
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -50,27 +50,31 @@ int update_persistent_clock(struct timespec now)
  }
  #endif
  
-unsigned int get_rtc_time(struct rtc_time *tm)
+static int rtc_generic_get_time(struct device *dev, struct rtc_time *tm)
  {
-       if (rtc_sh_get_time != null_rtc_get_time) {
-               struct timespec tv;
+       struct timespec tv;
  
-               rtc_sh_get_time(&tv);
-               rtc_time_to_tm(tv.tv_sec, tm);
-       }
-
-       return RTC_24H;
+       rtc_sh_get_time(&tv);
+       rtc_time_to_tm(tv.tv_sec, tm);
+       return 0;
  }
-EXPORT_SYMBOL(get_rtc_time);
  
-int set_rtc_time(struct rtc_time *tm)
+static int rtc_generic_set_time(struct device *dev, struct rtc_time *tm)
  {
         unsigned long secs;
  
         rtc_tm_to_time(tm, &secs);
-       return rtc_sh_set_time(secs);
+       if ((rtc_sh_set_time == null_rtc_set_time) ||
+           (rtc_sh_set_time(secs) < 0))
+               return -EOPNOTSUPP;
+
+       return 0;
  }
-EXPORT_SYMBOL(set_rtc_time);
+
+static const struct rtc_class_ops rtc_generic_ops = {
+       .read_time = rtc_generic_get_time,
+       .set_time = rtc_generic_set_time,
+};
  
  static int __init rtc_generic_init(void)
  {
@@ -79,7 +83,10 @@ static int __init rtc_generic_init(void)
         if (rtc_sh_get_time == null_rtc_get_time)
                 return -ENODEV;
  
-       pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
+       pdev = platform_device_register_data(NULL, "rtc-generic", -1,
+                                            &rtc_generic_ops,
+                                            sizeof(rtc_generic_ops));
+
  
         return PTR_ERR_OR_ZERO(pdev);
  }
diff --git a/arch/sparc/include/asm/io_32.h b/arch/sparc/include/asm/io_32.h

index 57f26c398dc9d21961e3d811ac1e11c8ea4ccf71..4dd268a3a8b083942210e6c3892c9e43de1c2c70 100644 (file)
--- a/arch/sparc/include/asm/io_32.h
+++ b/arch/sparc/include/asm/io_32.h
@@ -140,16 +140,6 @@ void ioport_unmap(void __iomem *);
  struct pci_dev;
  void pci_iounmap(struct pci_dev *dev, void __iomem *);
  
-
-
-/*
- * At the moment, we do not use CMOS_READ anywhere outside of rtc.c,
- * so rtc_port is static in it. This should not change unless a new
- * hardware pops up.
- */
-#define RTC_PORT(x)   (rtc_port + (x))
-#define RTC_ALWAYS_BCD  0
-
  static inline int sbus_can_dma_64bit(void)
  {
         return 0; /* actually, sparc_cpu_model==sun4d */
diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common

index 58650d098fb417911ad0be5b14dd4ee5d508e78e..fd443852103c998fd997c04ee64ebb7c32716785 100644 (file)
--- a/arch/um/Kconfig.common
+++ b/arch/um/Kconfig.common
@@ -1,10 +1,12 @@
  config UML
         bool
         default y
+       select ARCH_HAS_KCOV
         select HAVE_ARCH_AUDITSYSCALL
         select HAVE_ARCH_SECCOMP_FILTER
         select HAVE_UID16
         select HAVE_FUTEX_CMPXCHG if FUTEX
+       select HAVE_DEBUG_KMEMLEAK
         select GENERIC_IRQ_SHOW
         select GENERIC_CPU_DEVICES
         select GENERIC_IO
@@ -31,10 +33,9 @@ config PCI
  config PCMCIA
         bool
  
-# Yet to do!
  config TRACE_IRQFLAGS_SUPPORT
         bool
-       default n
+       default y
  
  config LOCKDEP_SUPPORT
         bool
diff --git a/arch/um/include/asm/irqflags.h b/arch/um/include/asm/irqflags.h

index c780d8a1677351ab99cd4bdf643e5d363ac4e9e5..3bb221e1d5a4dd3b684ebde22ca56bfecda55bbe 100644 (file)
--- a/arch/um/include/asm/irqflags.h
+++ b/arch/um/include/asm/irqflags.h
@@ -6,37 +6,33 @@ extern int set_signals(int enable);
  extern void block_signals(void);
  extern void unblock_signals(void);
  
+#define arch_local_save_flags arch_local_save_flags
  static inline unsigned long arch_local_save_flags(void)
  {
         return get_signals();
  }
  
+#define arch_local_irq_restore arch_local_irq_restore
  static inline void arch_local_irq_restore(unsigned long flags)
  {
         set_signals(flags);
  }
  
+#define arch_local_irq_enable arch_local_irq_enable
  static inline void arch_local_irq_enable(void)
  {
         unblock_signals();
  }
  
+#define arch_local_irq_disable arch_local_irq_disable
  static inline void arch_local_irq_disable(void)
  {
         block_signals();
  }
  
-static inline unsigned long arch_local_irq_save(void)
-{
-       unsigned long flags;
-       flags = arch_local_save_flags();
-       arch_local_irq_disable();
-       return flags;
-}
+#define ARCH_IRQ_DISABLED      0
+#define ARCh_IRQ_ENABLED       (SIGIO|SIGVTALRM)
  
-static inline bool arch_irqs_disabled(void)
-{
-       return arch_local_save_flags() == 0;
-}
+#include <asm-generic/irqflags.h>
  
  #endif
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile

index a6a5e42caaef539cb24da8e046bdc998ecbdf976..2f36d515762ecbcaf9507f00b08fa6b955b919f9 100644 (file)
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -3,6 +3,11 @@
  # Licensed under the GPL
  #
  
+# Don't instrument UML-specific code; without this, we may crash when
+# accessing the instrumentation buffer for the first time from the
+# kernel.
+KCOV_INSTRUMENT                := n
+
  CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START)           \
                          -DELF_ARCH=$(LDS_ELF_ARCH)     \
                          -DELF_FORMAT=$(LDS_ELF_FORMAT) \
diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c

index 55cead809b18baccf804dd89dafec13c37eb0af0..48bae81f8dcab2febedc7238f190b985e5da4a13 100644 (file)
--- a/arch/um/kernel/initrd.c
+++ b/arch/um/kernel/initrd.c
@@ -37,8 +37,6 @@ static int __init read_initrd(void)
         }
  
         area = alloc_bootmem(size);
-       if (area == NULL)
-               return 0;
  
         if (load_initrd(initrd, area, size) == -1)
                 return 0;
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c

index 16630e75f056de31a9b511995636c838d644775e..e8175a8aa22c7b8c7c2f4ea2daf4addd1c8eb666 100644 (file)
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -319,9 +319,6 @@ int __init linux_main(int argc, char **argv)
  
         start_vm = VMALLOC_START;
  
-       setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem);
-       mem_total_pages(physmem_size, iomem_size, highmem);
-
         virtmem_size = physmem_size;
         stack = (unsigned long) argv;
         stack &= ~(1024 * 1024 - 1);
@@ -334,7 +331,6 @@ int __init linux_main(int argc, char **argv)
                 printf("Kernel virtual memory size shrunk to %lu bytes\n",
                        virtmem_size);
  
-       stack_protections((unsigned long) &init_thread_info);
         os_flush_stdout();
  
         return start_uml();
@@ -342,6 +338,10 @@ int __init linux_main(int argc, char **argv)
  
  void __init setup_arch(char **cmdline_p)
  {
+       stack_protections((unsigned long) &init_thread_info);
+       setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem);
+       mem_total_pages(physmem_size, iomem_size, highmem);
+
         paging_init();
         strlcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
         *cmdline_p = command_line;
diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile

index 08ff5094fcdd9c82897d32b1c877284803b497b2..ada473bf6f46e536048a8589f7af67edd6048f1a 100644 (file)
--- a/arch/um/os-Linux/Makefile
+++ b/arch/um/os-Linux/Makefile
@@ -3,6 +3,9 @@
  # Licensed under the GPL
  #
  
+# Don't instrument UML-specific code
+KCOV_INSTRUMENT                := n
+
  obj-y = aio.o execvp.o file.o helper.o irq.o main.o mem.o process.o \
         registers.o sigio.o signal.o start_up.o time.o tty.o \
         umid.o user_syms.o util.o drivers/ skas/
diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c

index 8acaf4e384c0fc45612819f18b97cfe1ce3e1771..a86d7cc2c2d82fa7d0e5c13d1f3188efcd660fba 100644 (file)
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -15,6 +15,7 @@
  #include <kern_util.h>
  #include <os.h>
  #include <sysdep/mcontext.h>
+#include <um_malloc.h>
  
  void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = {
         [SIGTRAP]       = relay_signal,
@@ -32,7 +33,7 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
         struct uml_pt_regs *r;
         int save_errno = errno;
  
-       r = malloc(sizeof(struct uml_pt_regs));
+       r = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC);
         if (!r)
                 panic("out of memory");
  
@@ -91,7 +92,7 @@ static void timer_real_alarm_handler(mcontext_t *mc)
  {
         struct uml_pt_regs *regs;
  
-       regs = malloc(sizeof(struct uml_pt_regs));
+       regs = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC);
         if (!regs)
                 panic("out of memory");
  
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig

index 3a9add58d794e698d059d807de749c57a6789d43..5c6e7471b732335bf0b4272a4f274520d2bb4f4e 100644 (file)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -152,6 +152,7 @@ config X86
         select OLD_SIGSUSPEND3                  if X86_32 || IA32_EMULATION
         select PERF_EVENTS
         select RTC_LIB
+       select RTC_MC146818_LIB
         select SPARSE_IRQ
         select SRCU
         select SYSCTL_EXCEPTION_TRACE
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h

index 0f555cc3198419b3b60a9121702cfe09c11dd197..24acd9ba7837c71a9d42bc567648c506689d2998 100644 (file)
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -6,7 +6,6 @@
  
  #include <asm/io.h>
  #include <asm/processor.h>
-#include <linux/mc146818rtc.h>
  
  #ifndef RTC_PORT
  #define RTC_PORT(x)    (0x70 + (x))
diff --git a/arch/x86/include/asm/rtc.h b/arch/x86/include/asm/rtc.h

deleted file mode 100644 (file)

index f71c3b0..0000000
--- a/arch/x86/include/asm/rtc.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/rtc.h>
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c

index 3d747070fe6702a1c3bdf764c8de3bfb3f202038..ed16e58658a4201184fd0b1d8c470f5eb3e32c1e 100644 (file)
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1019,7 +1019,6 @@ void hpet_disable(void)
   */
  #include <linux/mc146818rtc.h>
  #include <linux/rtc.h>
-#include <asm/rtc.h>
  
  #define DEFAULT_RTC_INT_FREQ   64
  #define DEFAULT_RTC_SHIFT      6
@@ -1243,7 +1242,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
         memset(&curr_time, 0, sizeof(struct rtc_time));
  
         if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
-               get_rtc_time(&curr_time);
+               mc146818_get_time(&curr_time);
  
         if (hpet_rtc_flags & RTC_UIE &&
             curr_time.tm_sec != hpet_prev_update_sec) {
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c

index 04b132a767f116e8ff35efcbbc036e331a145a4b..bfe4d6c96fbd8ff8563cf0f69947251fe837c4cc 100644 (file)
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -17,6 +17,7 @@
  #include <linux/debugfs.h>
  #include <linux/delay.h>
  #include <linux/hardirq.h>
+#include <linux/ratelimit.h>
  #include <linux/slab.h>
  #include <linux/export.h>
  
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c

index eceaa082ec3fcb1b0f98cb11a6d4723f5dbc3d73..79c6311cd91252b9fe6b618b88d95b6ec4036bd2 100644 (file)
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -13,7 +13,6 @@
  #include <asm/x86_init.h>
  #include <asm/time.h>
  #include <asm/intel-mid.h>
-#include <asm/rtc.h>
  #include <asm/setup.h>
  
  #ifdef CONFIG_X86_32
@@ -47,7 +46,7 @@ int mach_set_rtc_mmss(const struct timespec *now)
  
         rtc_time_to_tm(nowtime, &tm);
         if (!rtc_valid_tm(&tm)) {
-               retval = set_rtc_time(&tm);
+               retval = mc146818_set_time(&tm);
                 if (retval)
                         printk(KERN_ERR "%s: RTC write failed with error %d\n",
                                __func__, retval);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c

index 17c8bbd4e2f0928634e67b00e86b63c24f381494..1fbb408e2e721837e7d28a9cdfdf59ffd96973d3 100644 (file)
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -51,7 +51,6 @@
  #include <asm/cacheflush.h>
  #include <asm/tlbflush.h>
  #include <asm/x86_init.h>
-#include <asm/rtc.h>
  #include <asm/uv/uv.h>
  
  static struct efi efi_phys __initdata;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c

index 04db6fbce96db5698ac2539044c613bf0ee83656..677e29e294732560e2a1e66edbaa57a94b5d5cd9 100644 (file)
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -25,6 +25,7 @@
  #include <linux/bootmem.h>
  #include <linux/ioport.h>
  #include <linux/init.h>
+#include <linux/mc146818rtc.h>
  #include <linux/efi.h>
  #include <linux/uaccess.h>
  #include <linux/io.h>
diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c

index ee40fcb6e54dd5f816819bf2f048dcc9177ce95c..58024862a7eb304617fb4270fa628d5d4e0dd23e 100644 (file)
--- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c
+++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
@@ -22,6 +22,7 @@
  #include <linux/init.h>
  #include <linux/sfi.h>
  #include <linux/platform_device.h>
+#include <linux/mc146818rtc.h>
  
  #include <asm/intel-mid.h>
  #include <asm/intel_mid_vrtc.h>
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c

index f2b5e6a5cf956102905f64462db824a8a355cca5..f0b5f2d402afb15f639be87f9581c44c7298bdef 100644 (file)
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -37,11 +37,11 @@ unsigned long jump_address_phys;
   */
  unsigned long restore_cr3 __visible;
  
-pgd_t *temp_level4_pgt __visible;
+unsigned long temp_level4_pgt __visible;
  
  unsigned long relocated_restore_code __visible;
  
-static int set_up_temporary_text_mapping(void)
+static int set_up_temporary_text_mapping(pgd_t *pgd)
  {
         pmd_t *pmd;
         pud_t *pud;
@@ -71,7 +71,7 @@ static int set_up_temporary_text_mapping(void)
                 __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC));
         set_pud(pud + pud_index(restore_jump_address),
                 __pud(__pa(pmd) | _KERNPG_TABLE));
-       set_pgd(temp_level4_pgt + pgd_index(restore_jump_address),
+       set_pgd(pgd + pgd_index(restore_jump_address),
                 __pgd(__pa(pud) | _KERNPG_TABLE));
  
         return 0;
@@ -90,15 +90,16 @@ static int set_up_temporary_mappings(void)
                 .kernel_mapping = true,
         };
         unsigned long mstart, mend;
+       pgd_t *pgd;
         int result;
         int i;
  
-       temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
-       if (!temp_level4_pgt)
+       pgd = (pgd_t *)get_safe_page(GFP_ATOMIC);
+       if (!pgd)
                 return -ENOMEM;
  
         /* Prepare a temporary mapping for the kernel text */
-       result = set_up_temporary_text_mapping();
+       result = set_up_temporary_text_mapping(pgd);
         if (result)
                 return result;
  
@@ -107,13 +108,12 @@ static int set_up_temporary_mappings(void)
                 mstart = pfn_mapped[i].start << PAGE_SHIFT;
                 mend   = pfn_mapped[i].end << PAGE_SHIFT;
  
-               result = kernel_ident_mapping_init(&info, temp_level4_pgt,
-                                                  mstart, mend);
-
+               result = kernel_ident_mapping_init(&info, pgd, mstart, mend);
                 if (result)
                         return result;
         }
  
+       temp_level4_pgt = (unsigned long)pgd - __PAGE_OFFSET;
         return 0;
  }
  
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S

index 8eee0e9c93f0c857cbbd6b23545ecbfe5d252f7e..ce8da3a0412cbb1a715b56e4c2f41cc431fe9965 100644 (file)
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -72,8 +72,6 @@ ENTRY(restore_image)
         /* code below has been relocated to a safe page */
  ENTRY(core_restore_code)
         /* switch to temporary page tables */
-       movq    $__PAGE_OFFSET, %rcx
-       subq    %rcx, %rax
         movq    %rax, %cr3
         /* flush TLB */
         movq    %rbx, %rcx
diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile

index 6c803ca49b5d2e12f5fcd0541febe92fdccee15c..d72dec406ccbee1be87b64747d4d2aaaf13bc301 100644 (file)
--- a/arch/x86/um/vdso/Makefile
+++ b/arch/x86/um/vdso/Makefile
@@ -2,6 +2,9 @@
  # Building vDSO images for x86.
  #
  
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT                := n
+
  VDSO64-y               := y
  
  vdso-install-$(VDSO64-y)       += vdso.so
diff --git a/drivers/acpi/acpi_cmos_rtc.c b/drivers/acpi/acpi_cmos_rtc.c

index 81dc75033f1590111d488216302f475c42b42137..0980a133916fa0781f59f5d470309567a1c23294 100644 (file)
--- a/drivers/acpi/acpi_cmos_rtc.c
+++ b/drivers/acpi/acpi_cmos_rtc.c
@@ -14,7 +14,7 @@
  #include <linux/err.h>
  #include <linux/kernel.h>
  #include <linux/module.h>
-#include <asm-generic/rtc.h>
+#include <linux/mc146818rtc.h>
  
  #include "internal.h"
  
diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c

index 148f4e5ca104b5a392008f6ebf9e864c8afaddbc..31abb0bdd4f29881a9590bed57f9f0113fc5da19 100644 (file)
--- a/drivers/acpi/button.c
+++ b/drivers/acpi/button.c
@@ -232,8 +232,10 @@ remove_dev_dir:
         acpi_device_dir(device) = NULL;
  remove_lid_dir:
         remove_proc_entry(ACPI_BUTTON_SUBCLASS_LID, acpi_button_dir);
+       acpi_lid_dir = NULL;
  remove_button_dir:
         remove_proc_entry(ACPI_BUTTON_CLASS, acpi_root_dir);
+       acpi_button_dir = NULL;
         goto done;
  }
  
@@ -250,7 +252,9 @@ static int acpi_button_remove_fs(struct acpi_device *device)
                           acpi_lid_dir);
         acpi_device_dir(device) = NULL;
         remove_proc_entry(ACPI_BUTTON_SUBCLASS_LID, acpi_button_dir);
+       acpi_lid_dir = NULL;
         remove_proc_entry(ACPI_BUTTON_CLASS, acpi_root_dir);
+       acpi_button_dir = NULL;
  
         return 0;
  }
diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c

index 999a109146787f0a6b1b43273daa552c4b1c36de..e7bd57cc550a64663c1a5f3016b951141bdb2b40 100644 (file)
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -101,6 +101,7 @@ enum ec_command {
  #define ACPI_EC_UDELAY_POLL    550     /* Wait 1ms for EC transaction polling */
  #define ACPI_EC_CLEAR_MAX      100     /* Maximum number of events to query
                                          * when trying to clear the EC */
+#define ACPI_EC_MAX_QUERIES    16      /* Maximum number of parallel queries */
  
  enum {
         EC_FLAGS_QUERY_PENDING,         /* Query is pending */
@@ -121,6 +122,10 @@ static unsigned int ec_delay __read_mostly = ACPI_EC_DELAY;
  module_param(ec_delay, uint, 0644);
  MODULE_PARM_DESC(ec_delay, "Timeout(ms) waited until an EC command completes");
  
+static unsigned int ec_max_queries __read_mostly = ACPI_EC_MAX_QUERIES;
+module_param(ec_max_queries, uint, 0644);
+MODULE_PARM_DESC(ec_max_queries, "Maximum parallel _Qxx evaluations");
+
  static bool ec_busy_polling __read_mostly;
  module_param(ec_busy_polling, bool, 0644);
  MODULE_PARM_DESC(ec_busy_polling, "Use busy polling to advance EC transaction");
@@ -174,6 +179,7 @@ static void acpi_ec_event_processor(struct work_struct *work);
  
  struct acpi_ec *boot_ec, *first_ec;
  EXPORT_SYMBOL(first_ec);
+static struct workqueue_struct *ec_query_wq;
  
  static int EC_FLAGS_CLEAR_ON_RESUME; /* Needs acpi_ec_clear() on boot/resume */
  static int EC_FLAGS_QUERY_HANDSHAKE; /* Needs QR_EC issued when SCI_EVT set */
@@ -1098,7 +1104,7 @@ static int acpi_ec_query(struct acpi_ec *ec, u8 *data)
          * work queue execution.
          */
         ec_dbg_evt("Query(0x%02x) scheduled", value);
-       if (!schedule_work(&q->work)) {
+       if (!queue_work(ec_query_wq, &q->work)) {
                 ec_dbg_evt("Query(0x%02x) overlapped", value);
                 result = -EBUSY;
         }
@@ -1660,15 +1666,41 @@ static struct acpi_driver acpi_ec_driver = {
                 },
  };
  
+static inline int acpi_ec_query_init(void)
+{
+       if (!ec_query_wq) {
+               ec_query_wq = alloc_workqueue("kec_query", 0,
+                                             ec_max_queries);
+               if (!ec_query_wq)
+                       return -ENODEV;
+       }
+       return 0;
+}
+
+static inline void acpi_ec_query_exit(void)
+{
+       if (ec_query_wq) {
+               destroy_workqueue(ec_query_wq);
+               ec_query_wq = NULL;
+       }
+}
+
  int __init acpi_ec_init(void)
  {
-       int result = 0;
+       int result;
  
+       /* register workqueue for _Qxx evaluations */
+       result = acpi_ec_query_init();
+       if (result)
+               goto err_exit;
         /* Now register the driver for the EC */
         result = acpi_bus_register_driver(&acpi_ec_driver);
-       if (result < 0)
-               return -ENODEV;
+       if (result)
+               goto err_exit;
  
+err_exit:
+       if (result)
+               acpi_ec_query_exit();
         return result;
  }
  
@@ -1678,5 +1710,6 @@ static void __exit acpi_ec_exit(void)
  {
  
         acpi_bus_unregister_driver(&acpi_ec_driver);
+       acpi_ec_query_exit();
  }
  #endif /* 0 */
diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c

index 7c04c87738a69d13b28060fe5dfa9b095798709e..df0c70963d9e85a3e24d2e3e8eee9fbc2791e474 100644 (file)
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -402,6 +402,22 @@ struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
  }
  EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact);
  
+static noinline struct dev_pm_opp *_find_freq_ceil(struct opp_table *opp_table,
+                                                  unsigned long *freq)
+{
+       struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+       list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
+               if (temp_opp->available && temp_opp->rate >= *freq) {
+                       opp = temp_opp;
+                       *freq = opp->rate;
+                       break;
+               }
+       }
+
+       return opp;
+}
+
  /**
   * dev_pm_opp_find_freq_ceil() - Search for an rounded ceil freq
   * @dev:       device for which we do this operation
@@ -427,7 +443,6 @@ struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
                                              unsigned long *freq)
  {
         struct opp_table *opp_table;
-       struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
  
         opp_rcu_lockdep_assert();
  
@@ -440,15 +455,7 @@ struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
         if (IS_ERR(opp_table))
                 return ERR_CAST(opp_table);
  
-       list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
-               if (temp_opp->available && temp_opp->rate >= *freq) {
-                       opp = temp_opp;
-                       *freq = opp->rate;
-                       break;
-               }
-       }
-
-       return opp;
+       return _find_freq_ceil(opp_table, freq);
  }
  EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil);
  
@@ -612,7 +619,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
                 return PTR_ERR(opp_table);
         }
  
-       old_opp = dev_pm_opp_find_freq_ceil(dev, &old_freq);
+       old_opp = _find_freq_ceil(opp_table, &old_freq);
         if (!IS_ERR(old_opp)) {
                 ou_volt = old_opp->u_volt;
                 ou_volt_min = old_opp->u_volt_min;
@@ -622,7 +629,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
                         __func__, old_freq, PTR_ERR(old_opp));
         }
  
-       opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+       opp = _find_freq_ceil(opp_table, &freq);
         if (IS_ERR(opp)) {
                 ret = PTR_ERR(opp);
                 dev_err(dev, "%s: failed to find OPP for freq %lu (%d)\n",
diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c

index a6975795e7f3c7cb9b815a15355ecb466ac3aa65..efec10b49d59a72234ec3544b63edd9d5ab144c4 100644 (file)
--- a/drivers/base/power/trace.c
+++ b/drivers/base/power/trace.c
@@ -11,7 +11,7 @@
  #include <linux/export.h>
  #include <linux/rtc.h>
  
-#include <asm/rtc.h>
+#include <linux/mc146818rtc.h>
  
  #include "power.h"
  
@@ -103,7 +103,7 @@ static int set_magic_time(unsigned int user, unsigned int file, unsigned int dev
         n /= 24;
         time.tm_min = (n % 20) * 3;
         n /= 20;
-       set_rtc_time(&time);
+       mc146818_set_time(&time);
         return n ? -1 : 0;
  }
  
@@ -112,7 +112,7 @@ static unsigned int read_magic_time(void)
         struct rtc_time time;
         unsigned int val;
  
-       get_rtc_time(&time);
+       mc146818_get_time(&time);
         pr_info("RTC time: %2d:%02d:%02d, date: %02d/%02d/%02d\n",
                 time.tm_hour, time.tm_min, time.tm_sec,
                 time.tm_mon + 1, time.tm_mday, time.tm_year % 100);
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c

index 5fb7718f256cf9d7bd45229dcc6e8a0b2225b1c5..62e4de2aa8d159222bbfce1b7c498a311eace2cb 100644 (file)
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -334,10 +334,9 @@ void device_wakeup_arm_wake_irqs(void)
         struct wakeup_source *ws;
  
         rcu_read_lock();
-       list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
-               if (ws->wakeirq)
-                       dev_pm_arm_wake_irq(ws->wakeirq);
-       }
+       list_for_each_entry_rcu(ws, &wakeup_sources, entry)
+               dev_pm_arm_wake_irq(ws->wakeirq);
+
         rcu_read_unlock();
  }
  
@@ -351,10 +350,9 @@ void device_wakeup_disarm_wake_irqs(void)
         struct wakeup_source *ws;
  
         rcu_read_lock();
-       list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
-               if (ws->wakeirq)
-                       dev_pm_disarm_wake_irq(ws->wakeirq);
-       }
+       list_for_each_entry_rcu(ws, &wakeup_sources, entry)
+               dev_pm_disarm_wake_irq(ws->wakeirq);
+
         rcu_read_unlock();
  }
  
@@ -390,9 +388,7 @@ int device_wakeup_disable(struct device *dev)
                 return -EINVAL;
  
         ws = device_wakeup_detach(dev);
-       if (ws)
-               wakeup_source_unregister(ws);
-
+       wakeup_source_unregister(ws);
         return 0;
  }
  EXPORT_SYMBOL_GPL(device_wakeup_disable);
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig

index fdb8f3e10b6f13cb7831932c2e22463e0e9050e8..dcc09739a54ef860343ac2fca5acf59724ca60e2 100644 (file)
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -293,7 +293,7 @@ if RTC_LIB=n
  
  config RTC
         tristate "Enhanced Real Time Clock Support (legacy PC RTC driver)"
-       depends on ALPHA || (MIPS && MACH_LOONGSON64) || MN10300
+       depends on ALPHA || (MIPS && MACH_LOONGSON64)
         ---help---
           If you say Y here and create a character special file /dev/rtc with
           major number 10 and minor number 135 using mknod ("man mknod"), you
@@ -339,32 +339,6 @@ config JS_RTC
           To compile this driver as a module, choose M here: the
           module will be called js-rtc.
  
-config GEN_RTC
-       tristate "Generic /dev/rtc emulation"
-       depends on RTC!=y
-       depends on ALPHA || M68K || MN10300 || PARISC || PPC || X86
-       ---help---
-         If you say Y here and create a character special file /dev/rtc with
-         major number 10 and minor number 135 using mknod ("man mknod"), you
-         will get access to the real time clock (or hardware clock) built
-         into your computer.
-
-         It reports status information via the file /proc/driver/rtc and its
-         behaviour is set by various ioctls on /dev/rtc. If you enable the
-         "extended RTC operation" below it will also provide an emulation
-         for RTC_UIE which is required by some programs and may improve
-         precision in some cases.
-
-         To compile this driver as a module, choose M here: the
-         module will be called genrtc.
-
-config GEN_RTC_X
-       bool "Extended RTC operation"
-       depends on GEN_RTC
-       help
-         Provides an emulation for RTC_UIE which is required by some programs
-         and may improve precision of the generic RTC support in some cases.
-
  config EFI_RTC
         bool "EFI Real Time Clock Services"
         depends on IA64
diff --git a/drivers/char/Makefile b/drivers/char/Makefile

index 55d16bf3ccc5ce4817e5e2bb1d28619570f9821c..6e6c244a66a02c4efd57229b0f31d331377bbc55 100644 (file)
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -25,7 +25,6 @@ obj-$(CONFIG_APPLICOM)                += applicom.o
  obj-$(CONFIG_SONYPI)           += sonypi.o
  obj-$(CONFIG_RTC)              += rtc.o
  obj-$(CONFIG_HPET)             += hpet.o
-obj-$(CONFIG_GEN_RTC)          += genrtc.o
  obj-$(CONFIG_EFI_RTC)          += efirtc.o
  obj-$(CONFIG_DS1302)           += ds1302.o
  obj-$(CONFIG_XILINX_HWICAP)    += xilinx_hwicap/
diff --git a/drivers/char/genrtc.c b/drivers/char/genrtc.c

deleted file mode 100644 (file)

index 4f94375..0000000
--- a/drivers/char/genrtc.c
+++ /dev/null
@@ -1,539 +0,0 @@
-/*
- *     Real Time Clock interface for
- *             - q40 and other m68k machines,
- *             - HP PARISC machines
- *             - PowerPC machines
- *      emulate some RTC irq capabilities in software
- *
- *      Copyright (C) 1999 Richard Zidlicky
- *
- *     based on Paul Gortmaker's rtc.c device and
- *           Sam Creasey Generic rtc driver
- *
- *     This driver allows use of the real time clock (built into
- *     nearly all computers) from user space. It exports the /dev/rtc
- *     interface supporting various ioctl() and also the /proc/driver/rtc
- *     pseudo-file for status information.
- *
- *     The ioctls can be used to set the interrupt behaviour where
- *     supported.
- *
- *     The /dev/rtc interface will block on reads until an interrupt
- *     has been received. If a RTC interrupt has already happened,
- *     it will output an unsigned long and then block. The output value
- *     contains the interrupt status in the low byte and the number of
- *     interrupts since the last read in the remaining high bytes. The
- *     /dev/rtc interface can also be used with the select(2) call.
- *
- *     This program is free software; you can redistribute it and/or
- *     modify it under the terms of the GNU General Public License
- *     as published by the Free Software Foundation; either version
- *     2 of the License, or (at your option) any later version.
- *
-
- *      1.01 fix for 2.3.X                    rz@linux-m68k.org
- *      1.02 merged with code from genrtc.c   rz@linux-m68k.org
- *      1.03 make it more portable            zippel@linux-m68k.org
- *      1.04 removed useless timer code       rz@linux-m68k.org
- *      1.05 portable RTC_UIE emulation       rz@linux-m68k.org
- *      1.06 set_rtc_time can return an error trini@kernel.crashing.org
- *      1.07 ported to HP PARISC (hppa)              Helge Deller <deller@gmx.de>
- */
-
-#define RTC_VERSION    "1.07"
-
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/errno.h>
-#include <linux/miscdevice.h>
-#include <linux/fcntl.h>
-
-#include <linux/rtc.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/mutex.h>
-#include <linux/workqueue.h>
-
-#include <asm/uaccess.h>
-#include <asm/rtc.h>
-
-/*
- *     We sponge a minor off of the misc major. No need slurping
- *     up another valuable major dev number for this. If you add
- *     an ioctl, make sure you don't conflict with SPARC's RTC
- *     ioctls.
- */
-
-static DEFINE_MUTEX(gen_rtc_mutex);
-static DECLARE_WAIT_QUEUE_HEAD(gen_rtc_wait);
-
-/*
- *     Bits in gen_rtc_status.
- */
-
-#define RTC_IS_OPEN            0x01    /* means /dev/rtc is in use     */
-
-static unsigned char gen_rtc_status;   /* bitmapped status byte.       */
-static unsigned long gen_rtc_irq_data; /* our output to the world      */
-
-/* months start at 0 now */
-static unsigned char days_in_mo[] =
-{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
-
-static int irq_active;
-
-#ifdef CONFIG_GEN_RTC_X
-static struct work_struct genrtc_task;
-static struct timer_list timer_task;
-
-static unsigned int oldsecs;
-static int lostint;
-static unsigned long tt_exp;
-
-static void gen_rtc_timer(unsigned long data);
-
-static volatile int stask_active;              /* schedule_work */
-static volatile int ttask_active;              /* timer_task */
-static int stop_rtc_timers;                    /* don't requeue tasks */
-static DEFINE_SPINLOCK(gen_rtc_lock);
-
-static void gen_rtc_interrupt(unsigned long arg);
-
-/*
- * Routine to poll RTC seconds field for change as often as possible,
- * after first RTC_UIE use timer to reduce polling
- */
-static void genrtc_troutine(struct work_struct *work)
-{
-       unsigned int tmp = get_rtc_ss();
-       
-       if (stop_rtc_timers) {
-               stask_active = 0;
-               return;
-       }
-
-       if (oldsecs != tmp){
-               oldsecs = tmp;
-
-               timer_task.function = gen_rtc_timer;
-               timer_task.expires = jiffies + HZ - (HZ/10);
-               tt_exp=timer_task.expires;
-               ttask_active=1;
-               stask_active=0;
-               add_timer(&timer_task);
-
-               gen_rtc_interrupt(0);
-       } else if (schedule_work(&genrtc_task) == 0)
-               stask_active = 0;
-}
-
-static void gen_rtc_timer(unsigned long data)
-{
-       lostint = get_rtc_ss() - oldsecs ;
-       if (lostint<0) 
-               lostint = 60 - lostint;
-       if (time_after(jiffies, tt_exp))
-               printk(KERN_INFO "genrtc: timer task delayed by %ld jiffies\n",
-                      jiffies-tt_exp);
-       ttask_active=0;
-       stask_active=1;
-       if ((schedule_work(&genrtc_task) == 0))
-               stask_active = 0;
-}
-
-/* 
- * call gen_rtc_interrupt function to signal an RTC_UIE,
- * arg is unused.
- * Could be invoked either from a real interrupt handler or
- * from some routine that periodically (eg 100HZ) monitors
- * whether RTC_SECS changed
- */
-static void gen_rtc_interrupt(unsigned long arg)
-{
-       /*  We store the status in the low byte and the number of
-        *      interrupts received since the last read in the remainder
-        *      of rtc_irq_data.  */
-
-       gen_rtc_irq_data += 0x100;
-       gen_rtc_irq_data &= ~0xff;
-       gen_rtc_irq_data |= RTC_UIE;
-
-       if (lostint){
-               printk("genrtc: system delaying clock ticks?\n");
-               /* increment count so that userspace knows something is wrong */
-               gen_rtc_irq_data += ((lostint-1)<<8);
-               lostint = 0;
-       }
-
-       wake_up_interruptible(&gen_rtc_wait);
-}
-
-/*
- *     Now all the various file operations that we export.
- */
-static ssize_t gen_rtc_read(struct file *file, char __user *buf,
-                       size_t count, loff_t *ppos)
-{
-       unsigned long data;
-       ssize_t retval;
-
-       if (count != sizeof (unsigned int) && count != sizeof (unsigned long))
-               return -EINVAL;
-
-       if (file->f_flags & O_NONBLOCK && !gen_rtc_irq_data)
-               return -EAGAIN;
-
-       retval = wait_event_interruptible(gen_rtc_wait,
-                       (data = xchg(&gen_rtc_irq_data, 0)));
-       if (retval)
-               goto out;
-
-       /* first test allows optimizer to nuke this case for 32-bit machines */
-       if (sizeof (int) != sizeof (long) && count == sizeof (unsigned int)) {
-               unsigned int uidata = data;
-               retval = put_user(uidata, (unsigned int __user *)buf) ?:
-                       sizeof(unsigned int);
-       }
-       else {
-               retval = put_user(data, (unsigned long __user *)buf) ?:
-                       sizeof(unsigned long);
-       }
-out:
-       return retval;
-}
-
-static unsigned int gen_rtc_poll(struct file *file,
-                                struct poll_table_struct *wait)
-{
-       poll_wait(file, &gen_rtc_wait, wait);
-       if (gen_rtc_irq_data != 0)
-               return POLLIN | POLLRDNORM;
-       return 0;
-}
-
-#endif
-
-/*
- * Used to disable/enable interrupts, only RTC_UIE supported
- * We also clear out any old irq data after an ioctl() that
- * meddles with the interrupt enable/disable bits.
- */
-
-static inline void gen_clear_rtc_irq_bit(unsigned char bit)
-{
-#ifdef CONFIG_GEN_RTC_X
-       stop_rtc_timers = 1;
-       if (ttask_active){
-               del_timer_sync(&timer_task);
-               ttask_active = 0;
-       }
-       while (stask_active)
-               schedule();
-
-       spin_lock(&gen_rtc_lock);
-       irq_active = 0;
-       spin_unlock(&gen_rtc_lock);
-#endif
-}
-
-static inline int gen_set_rtc_irq_bit(unsigned char bit)
-{
-#ifdef CONFIG_GEN_RTC_X
-       spin_lock(&gen_rtc_lock);
-       if ( !irq_active ) {
-               irq_active = 1;
-               stop_rtc_timers = 0;
-               lostint = 0;
-               INIT_WORK(&genrtc_task, genrtc_troutine);
-               oldsecs = get_rtc_ss();
-               init_timer(&timer_task);
-
-               stask_active = 1;
-               if (schedule_work(&genrtc_task) == 0){
-                       stask_active = 0;
-               }
-       }
-       spin_unlock(&gen_rtc_lock);
-       gen_rtc_irq_data = 0;
-       return 0;
-#else
-       return -EINVAL;
-#endif
-}
-
-static int gen_rtc_ioctl(struct file *file,
-                        unsigned int cmd, unsigned long arg)
-{
-       struct rtc_time wtime;
-       struct rtc_pll_info pll;
-       void __user *argp = (void __user *)arg;
-
-       switch (cmd) {
-
-       case RTC_PLL_GET:
-           if (get_rtc_pll(&pll))
-                   return -EINVAL;
-           else
-                   return copy_to_user(argp, &pll, sizeof pll) ? -EFAULT : 0;
-
-       case RTC_PLL_SET:
-               if (!capable(CAP_SYS_TIME))
-                       return -EACCES;
-               if (copy_from_user(&pll, argp, sizeof(pll)))
-                       return -EFAULT;
-           return set_rtc_pll(&pll);
-
-       case RTC_UIE_OFF:       /* disable ints from RTC updates.       */
-               gen_clear_rtc_irq_bit(RTC_UIE);
-               return 0;
-
-       case RTC_UIE_ON:        /* enable ints for RTC updates. */
-               return gen_set_rtc_irq_bit(RTC_UIE);
-
-       case RTC_RD_TIME:       /* Read the time/date from RTC  */
-               /* this doesn't get week-day, who cares */
-               memset(&wtime, 0, sizeof(wtime));
-               get_rtc_time(&wtime);
-
-               return copy_to_user(argp, &wtime, sizeof(wtime)) ? -EFAULT : 0;
-
-       case RTC_SET_TIME:      /* Set the RTC */
-           {
-               int year;
-               unsigned char leap_yr;
-
-               if (!capable(CAP_SYS_TIME))
-                       return -EACCES;
-
-               if (copy_from_user(&wtime, argp, sizeof(wtime)))
-                       return -EFAULT;
-
-               year = wtime.tm_year + 1900;
-               leap_yr = ((!(year % 4) && (year % 100)) ||
-                          !(year % 400));
-
-               if ((wtime.tm_mon < 0 || wtime.tm_mon > 11) || (wtime.tm_mday < 1))
-                       return -EINVAL;
-
-               if (wtime.tm_mday < 0 || wtime.tm_mday >
-                   (days_in_mo[wtime.tm_mon] + ((wtime.tm_mon == 1) && leap_yr)))
-                       return -EINVAL;
-
-               if (wtime.tm_hour < 0 || wtime.tm_hour >= 24 ||
-                   wtime.tm_min < 0 || wtime.tm_min >= 60 ||
-                   wtime.tm_sec < 0 || wtime.tm_sec >= 60)
-                       return -EINVAL;
-
-               return set_rtc_time(&wtime);
-           }
-       }
-
-       return -EINVAL;
-}
-
-static long gen_rtc_unlocked_ioctl(struct file *file, unsigned int cmd,
-                                  unsigned long arg)
-{
-       int ret;
-
-       mutex_lock(&gen_rtc_mutex);
-       ret = gen_rtc_ioctl(file, cmd, arg);
-       mutex_unlock(&gen_rtc_mutex);
-
-       return ret;
-}
-
-/*
- *     We enforce only one user at a time here with the open/close.
- *     Also clear the previous interrupt data on an open, and clean
- *     up things on a close.
- */
-
-static int gen_rtc_open(struct inode *inode, struct file *file)
-{
-       mutex_lock(&gen_rtc_mutex);
-       if (gen_rtc_status & RTC_IS_OPEN) {
-               mutex_unlock(&gen_rtc_mutex);
-               return -EBUSY;
-       }
-
-       gen_rtc_status |= RTC_IS_OPEN;
-       gen_rtc_irq_data = 0;
-       irq_active = 0;
-       mutex_unlock(&gen_rtc_mutex);
-
-       return 0;
-}
-
-static int gen_rtc_release(struct inode *inode, struct file *file)
-{
-       /*
-        * Turn off all interrupts once the device is no longer
-        * in use and clear the data.
-        */
-
-       gen_clear_rtc_irq_bit(RTC_PIE|RTC_AIE|RTC_UIE);
-
-       gen_rtc_status &= ~RTC_IS_OPEN;
-       return 0;
-}
-
-
-#ifdef CONFIG_PROC_FS
-
-/*
- *     Info exported via "/proc/driver/rtc".
- */
-
-static int gen_rtc_proc_show(struct seq_file *m, void *v)
-{
-       struct rtc_time tm;
-       unsigned int flags;
-       struct rtc_pll_info pll;
-
-       flags = get_rtc_time(&tm);
-
-       seq_printf(m,
-                    "rtc_time\t: %02d:%02d:%02d\n"
-                    "rtc_date\t: %04d-%02d-%02d\n"
-                    "rtc_epoch\t: %04u\n",
-                    tm.tm_hour, tm.tm_min, tm.tm_sec,
-                    tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, 1900);
-
-       tm.tm_hour = tm.tm_min = tm.tm_sec = 0;
-
-       seq_puts(m, "alarm\t\t: ");
-       if (tm.tm_hour <= 24)
-               seq_printf(m, "%02d:", tm.tm_hour);
-       else
-               seq_puts(m, "**:");
-
-       if (tm.tm_min <= 59)
-               seq_printf(m, "%02d:", tm.tm_min);
-       else
-               seq_puts(m, "**:");
-
-       if (tm.tm_sec <= 59)
-               seq_printf(m, "%02d\n", tm.tm_sec);
-       else
-               seq_puts(m, "**\n");
-
-       seq_printf(m,
-                    "DST_enable\t: %s\n"
-                    "BCD\t\t: %s\n"
-                    "24hr\t\t: %s\n"
-                    "square_wave\t: %s\n"
-                    "alarm_IRQ\t: %s\n"
-                    "update_IRQ\t: %s\n"
-                    "periodic_IRQ\t: %s\n"
-                    "periodic_freq\t: %ld\n"
-                    "batt_status\t: %s\n",
-                    (flags & RTC_DST_EN) ? "yes" : "no",
-                    (flags & RTC_DM_BINARY) ? "no" : "yes",
-                    (flags & RTC_24H) ? "yes" : "no",
-                    (flags & RTC_SQWE) ? "yes" : "no",
-                    (flags & RTC_AIE) ? "yes" : "no",
-                    irq_active ? "yes" : "no",
-                    (flags & RTC_PIE) ? "yes" : "no",
-                    0L /* freq */,
-                    (flags & RTC_BATT_BAD) ? "bad" : "okay");
-       if (!get_rtc_pll(&pll))
-           seq_printf(m,
-                        "PLL adjustment\t: %d\n"
-                        "PLL max +ve adjustment\t: %d\n"
-                        "PLL max -ve adjustment\t: %d\n"
-                        "PLL +ve adjustment factor\t: %d\n"
-                        "PLL -ve adjustment factor\t: %d\n"
-                        "PLL frequency\t: %ld\n",
-                        pll.pll_value,
-                        pll.pll_max,
-                        pll.pll_min,
-                        pll.pll_posmult,
-                        pll.pll_negmult,
-                        pll.pll_clock);
-       return 0;
-}
-
-static int gen_rtc_proc_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, gen_rtc_proc_show, NULL);
-}
-
-static const struct file_operations gen_rtc_proc_fops = {
-       .open           = gen_rtc_proc_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static int __init gen_rtc_proc_init(void)
-{
-       struct proc_dir_entry *r;
-
-       r = proc_create("driver/rtc", 0, NULL, &gen_rtc_proc_fops);
-       if (!r)
-               return -ENOMEM;
-       return 0;
-}
-#else
-static inline int gen_rtc_proc_init(void) { return 0; }
-#endif /* CONFIG_PROC_FS */
-
-
-/*
- *     The various file operations we support.
- */
-
-static const struct file_operations gen_rtc_fops = {
-       .owner          = THIS_MODULE,
-#ifdef CONFIG_GEN_RTC_X
-       .read           = gen_rtc_read,
-       .poll           = gen_rtc_poll,
-#endif
-       .unlocked_ioctl = gen_rtc_unlocked_ioctl,
-       .open           = gen_rtc_open,
-       .release        = gen_rtc_release,
-       .llseek         = noop_llseek,
-};
-
-static struct miscdevice rtc_gen_dev =
-{
-       .minor          = RTC_MINOR,
-       .name           = "rtc",
-       .fops           = &gen_rtc_fops,
-};
-
-static int __init rtc_generic_init(void)
-{
-       int retval;
-
-       printk(KERN_INFO "Generic RTC Driver v%s\n", RTC_VERSION);
-
-       retval = misc_register(&rtc_gen_dev);
-       if (retval < 0)
-               return retval;
-
-       retval = gen_rtc_proc_init();
-       if (retval) {
-               misc_deregister(&rtc_gen_dev);
-               return retval;
-       }
-
-       return 0;
-}
-
-static void __exit rtc_generic_exit(void)
-{
-       remove_proc_entry ("driver/rtc", NULL);
-       misc_deregister(&rtc_gen_dev);
-}
-
-
-module_init(rtc_generic_init);
-module_exit(rtc_generic_exit);
-
-MODULE_AUTHOR("Richard Zidlicky");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_MISCDEV(RTC_MINOR);
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig

index c822d72629d5ca09820eb8b86a183a26fe27f7e8..74919aa81dcb3311765899b976fab44d0ddeb5d2 100644 (file)
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -32,7 +32,6 @@ config CPU_FREQ_BOOST_SW
  
  config CPU_FREQ_STAT
         bool "CPU frequency transition statistics"
-       default y
         help
           Export CPU frequency statistics information through sysfs.
  
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c

index 9ec033b4f2d9d6693a5d0139da92c5a8e2b22ab1..be9eade147f2335c056e0fc3c4297528330961b5 100644 (file)
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1374,6 +1374,8 @@ MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);
  
  static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = {
         ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_params),
+       ICPU(INTEL_FAM6_BROADWELL_X, core_params),
+       ICPU(INTEL_FAM6_SKYLAKE_X, core_params),
         {}
  };
  
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig

index 2137adfbd8c3493de15458504039f2e28b63b624..e9b7dc037ff8774d83be33b9410ffde653304fd5 100644 (file)
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -84,6 +84,7 @@ source "drivers/infiniband/ulp/iser/Kconfig"
  source "drivers/infiniband/ulp/isert/Kconfig"
  
  source "drivers/infiniband/sw/rdmavt/Kconfig"
+source "drivers/infiniband/sw/rxe/Kconfig"
  
  source "drivers/infiniband/hw/hfi1/Kconfig"
  
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c

index ad1b1adcf6f01894f1fc6f3f35eed308d55e269c..e6dfa1bd3defae37e1ee549e10886c2edeece697 100644 (file)
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -68,6 +68,7 @@ MODULE_DESCRIPTION("Generic RDMA CM Agent");
  MODULE_LICENSE("Dual BSD/GPL");
  
  #define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000
  #define CMA_MAX_CM_RETRIES 15
  #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
  #define CMA_IBOE_PACKET_LIFETIME 18
@@ -162,6 +163,14 @@ struct rdma_bind_list {
         unsigned short          port;
  };
  
+struct class_port_info_context {
+       struct ib_class_port_info       *class_port_info;
+       struct ib_device                *device;
+       struct completion               done;
+       struct ib_sa_query              *sa_query;
+       u8                              port_num;
+};
+
  static int cma_ps_alloc(struct net *net, enum rdma_port_space ps,
                         struct rdma_bind_list *bind_list, int snum)
  {
@@ -306,6 +315,7 @@ struct cma_multicast {
         struct sockaddr_storage addr;
         struct kref             mcref;
         bool                    igmp_joined;
+       u8                      join_state;
  };
  
  struct cma_work {
@@ -3752,10 +3762,63 @@ static void cma_set_mgid(struct rdma_id_private *id_priv,
         }
  }
  
+static void cma_query_sa_classport_info_cb(int status,
+                                          struct ib_class_port_info *rec,
+                                          void *context)
+{
+       struct class_port_info_context *cb_ctx = context;
+
+       WARN_ON(!context);
+
+       if (status || !rec) {
+               pr_debug("RDMA CM: %s port %u failed query ClassPortInfo status: %d\n",
+                        cb_ctx->device->name, cb_ctx->port_num, status);
+               goto out;
+       }
+
+       memcpy(cb_ctx->class_port_info, rec, sizeof(struct ib_class_port_info));
+
+out:
+       complete(&cb_ctx->done);
+}
+
+static int cma_query_sa_classport_info(struct ib_device *device, u8 port_num,
+                                      struct ib_class_port_info *class_port_info)
+{
+       struct class_port_info_context *cb_ctx;
+       int ret;
+
+       cb_ctx = kmalloc(sizeof(*cb_ctx), GFP_KERNEL);
+       if (!cb_ctx)
+               return -ENOMEM;
+
+       cb_ctx->device = device;
+       cb_ctx->class_port_info = class_port_info;
+       cb_ctx->port_num = port_num;
+       init_completion(&cb_ctx->done);
+
+       ret = ib_sa_classport_info_rec_query(&sa_client, device, port_num,
+                                            CMA_QUERY_CLASSPORT_INFO_TIMEOUT,
+                                            GFP_KERNEL, cma_query_sa_classport_info_cb,
+                                            cb_ctx, &cb_ctx->sa_query);
+       if (ret < 0) {
+               pr_err("RDMA CM: %s port %u failed to send ClassPortInfo query, ret: %d\n",
+                      device->name, port_num, ret);
+               goto out;
+       }
+
+       wait_for_completion(&cb_ctx->done);
+
+out:
+       kfree(cb_ctx);
+       return ret;
+}
+
  static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
                                  struct cma_multicast *mc)
  {
         struct ib_sa_mcmember_rec rec;
+       struct ib_class_port_info class_port_info;
         struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
         ib_sa_comp_mask comp_mask;
         int ret;
@@ -3774,7 +3837,24 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
         rec.qkey = cpu_to_be32(id_priv->qkey);
         rdma_addr_get_sgid(dev_addr, &rec.port_gid);
         rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
-       rec.join_state = 1;
+       rec.join_state = mc->join_state;
+
+       if (rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) {
+               ret = cma_query_sa_classport_info(id_priv->id.device,
+                                                 id_priv->id.port_num,
+                                                 &class_port_info);
+
+               if (ret)
+                       return ret;
+
+               if (!(ib_get_cpi_capmask2(&class_port_info) &
+                     IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT)) {
+                       pr_warn("RDMA CM: %s port %u Unable to multicast join\n"
+                               "RDMA CM: SM doesn't support Send Only Full Member option\n",
+                               id_priv->id.device->name, id_priv->id.port_num);
+                       return -EOPNOTSUPP;
+               }
+       }
  
         comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
                     IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
@@ -3843,6 +3923,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
         struct sockaddr *addr = (struct sockaddr *)&mc->addr;
         struct net_device *ndev = NULL;
         enum ib_gid_type gid_type;
+       bool send_only;
+
+       send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN);
  
         if (cma_zero_addr((struct sockaddr *)&mc->addr))
                 return -EINVAL;
@@ -3878,10 +3961,12 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
         if (addr->sa_family == AF_INET) {
                 if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
                         mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT;
-                       err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid,
-                                           true);
-                       if (!err)
-                               mc->igmp_joined = true;
+                       if (!send_only) {
+                               err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid,
+                                                   true);
+                               if (!err)
+                                       mc->igmp_joined = true;
+                       }
                 }
         } else {
                 if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
@@ -3911,7 +3996,7 @@ out1:
  }
  
  int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
-                       void *context)
+                       u8 join_state, void *context)
  {
         struct rdma_id_private *id_priv;
         struct cma_multicast *mc;
@@ -3930,6 +4015,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
         mc->context = context;
         mc->id_priv = id_priv;
         mc->igmp_joined = false;
+       mc->join_state = join_state;
         spin_lock(&id_priv->lock);
         list_add(&mc->list, &id_priv->mc_list);
         spin_unlock(&id_priv->lock);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c

index 5c155fa91eec8380a463ad687a6aa977c10bcefd..760ef603a4684009c0015600a473af530011d1d5 100644 (file)
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -311,6 +311,15 @@ static int read_port_immutable(struct ib_device *device)
         return 0;
  }
  
+void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len)
+{
+       if (dev->get_dev_fw_str)
+               dev->get_dev_fw_str(dev, str, str_len);
+       else
+               str[0] = '\0';
+}
+EXPORT_SYMBOL(ib_get_device_fw_str);
+
  /**
   * ib_register_device - Register an IB device with IB core
   * @device:Device to register
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c

index f0572049d291e8b862d88f5d45a63092cc61ffeb..357624f8b9d31db91c240bca73c22b7ce322b330 100644 (file)
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -183,15 +183,14 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv)
  
  /*
   * Release a reference on cm_id. If the last reference is being
- * released, enable the waiting thread (in iw_destroy_cm_id) to
- * get woken up, and return 1 if a thread is already waiting.
+ * released, free the cm_id and return 1.
   */
  static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
  {
         BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
         if (atomic_dec_and_test(&cm_id_priv->refcount)) {
                 BUG_ON(!list_empty(&cm_id_priv->work_list));
-               complete(&cm_id_priv->destroy_comp);
+               free_cm_id(cm_id_priv);
                 return 1;
         }
  
@@ -208,19 +207,10 @@ static void add_ref(struct iw_cm_id *cm_id)
  static void rem_ref(struct iw_cm_id *cm_id)
  {
         struct iwcm_id_private *cm_id_priv;
-       int cb_destroy;
  
         cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
  
-       /*
-        * Test bit before deref in case the cm_id gets freed on another
-        * thread.
-        */
-       cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
-       if (iwcm_deref_id(cm_id_priv) && cb_destroy) {
-               BUG_ON(!list_empty(&cm_id_priv->work_list));
-               free_cm_id(cm_id_priv);
-       }
+       (void)iwcm_deref_id(cm_id_priv);
  }
  
  static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
@@ -370,6 +360,12 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
         wait_event(cm_id_priv->connect_wait,
                    !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
  
+       /*
+        * Since we're deleting the cm_id, drop any events that
+        * might arrive before the last dereference.
+        */
+       set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags);
+
         spin_lock_irqsave(&cm_id_priv->lock, flags);
         switch (cm_id_priv->state) {
         case IW_CM_STATE_LISTEN:
@@ -433,13 +429,7 @@ void iw_destroy_cm_id(struct iw_cm_id *cm_id)
         struct iwcm_id_private *cm_id_priv;
  
         cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
-       BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags));
-
         destroy_cm_id(cm_id);
-
-       wait_for_completion(&cm_id_priv->destroy_comp);
-
-       free_cm_id(cm_id_priv);
  }
  EXPORT_SYMBOL(iw_destroy_cm_id);
  
@@ -809,10 +799,7 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
         ret = cm_id->cm_handler(cm_id, iw_event);
         if (ret) {
                 iw_cm_reject(cm_id, NULL, 0);
-               set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
-               destroy_cm_id(cm_id);
-               if (atomic_read(&cm_id_priv->refcount)==0)
-                       free_cm_id(cm_id_priv);
+               iw_destroy_cm_id(cm_id);
         }
  
  out:
@@ -1000,7 +987,6 @@ static void cm_work_handler(struct work_struct *_work)
         unsigned long flags;
         int empty;
         int ret = 0;
-       int destroy_id;
  
         spin_lock_irqsave(&cm_id_priv->lock, flags);
         empty = list_empty(&cm_id_priv->work_list);
@@ -1013,20 +999,14 @@ static void cm_work_handler(struct work_struct *_work)
                 put_work(work);
                 spin_unlock_irqrestore(&cm_id_priv->lock, flags);
  
-               ret = process_event(cm_id_priv, &levent);
-               if (ret) {
-                       set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
-                       destroy_cm_id(&cm_id_priv->id);
-               }
-               BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
-               destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
-               if (iwcm_deref_id(cm_id_priv)) {
-                       if (destroy_id) {
-                               BUG_ON(!list_empty(&cm_id_priv->work_list));
-                               free_cm_id(cm_id_priv);
-                       }
+               if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
+                       ret = process_event(cm_id_priv, &levent);
+                       if (ret)
+                               destroy_cm_id(&cm_id_priv->id);
+               } else
+                       pr_debug("dropping event %d\n", levent.event);
+               if (iwcm_deref_id(cm_id_priv))
                         return;
-               }
                 if (empty)
                         return;
                 spin_lock_irqsave(&cm_id_priv->lock, flags);
diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h

index 3f6cc82564c8b7726be9d31d65fb79db1ed8457a..82c2cd1b0a8043021bf27494714ee2983a6903b9 100644 (file)
--- a/drivers/infiniband/core/iwcm.h
+++ b/drivers/infiniband/core/iwcm.h
@@ -56,7 +56,7 @@ struct iwcm_id_private {
         struct list_head work_free_list;
  };
  
-#define IWCM_F_CALLBACK_DESTROY   1
+#define IWCM_F_DROP_EVENTS       1
  #define IWCM_F_CONNECT_WAIT       2
  
  #endif /* IWCM_H */
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c

index b65e06c560d7ddafa8c53292a5f6e36c917a9244..ade71e7f01313b7634a074885a3900dea087425b 100644 (file)
--- a/drivers/infiniband/core/iwpm_util.c
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -37,6 +37,7 @@
  #define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1)
  #define IWPM_REMINFO_HASH_SIZE 64
  #define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1)
+#define IWPM_MSG_SIZE          512
  
  static LIST_HEAD(iwpm_nlmsg_req_list);
  static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock);
@@ -452,7 +453,7 @@ struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh,
  {
         struct sk_buff *skb = NULL;
  
-       skb = dev_alloc_skb(NLMSG_GOODSIZE);
+       skb = dev_alloc_skb(IWPM_MSG_SIZE);
         if (!skb) {
                 pr_err("%s Unable to allocate skb\n", __func__);
                 goto create_nlmsg_exit;
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c

index a83ec28a147b4884492fa960fd721568791553f2..3a3c5d73bbfc833a89c6143f0e732441c8488ab8 100644 (file)
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -93,18 +93,6 @@ enum {
  
  struct mcast_member;
  
-/*
-* There are 4 types of join states:
-* FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember.
-*/
-enum {
-       FULLMEMBER_JOIN,
-       NONMEMBER_JOIN,
-       SENDONLY_NONMEBER_JOIN,
-       SENDONLY_FULLMEMBER_JOIN,
-       NUM_JOIN_MEMBERSHIP_TYPES,
-};
-
  struct mcast_group {
         struct ib_sa_mcmember_rec rec;
         struct rb_node          node;
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c

index 9b8c20c8209bcfa6deb62a20a1f598592e4b7f9b..10469b0088b500fbe2012bc4bd43d249dad3b705 100644 (file)
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -229,7 +229,10 @@ static void ibnl_rcv(struct sk_buff *skb)
  int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh,
                         __u32 pid)
  {
-       return nlmsg_unicast(nls, skb, pid);
+       int err;
+
+       err = netlink_unicast(nls, skb, pid, 0);
+       return (err < 0) ? err : 0;
  }
  EXPORT_SYMBOL(ibnl_unicast);
  
@@ -252,6 +255,7 @@ int __init ibnl_init(void)
                 return -ENOMEM;
         }
  
+       nls->sk_sndtimeo = 10 * HZ;
         return 0;
  }
  
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c

index 1eb9b1294a6383c689363bb02a713ee1cd660037..dbfd854c32c936b8e934bcc1cb27ce4325b61d8d 100644 (file)
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -58,19 +58,13 @@ static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
         return false;
  }
  
-static inline u32 rdma_rw_max_sge(struct ib_device *dev,
-               enum dma_data_direction dir)
-{
-       return dir == DMA_TO_DEVICE ?
-               dev->attrs.max_sge : dev->attrs.max_sge_rd;
-}
-
  static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
  {
         /* arbitrary limit to avoid allocating gigantic resources */
         return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
  }
  
+/* Caller must have zero-initialized *reg. */
  static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
                 struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
                 u32 sg_cnt, u32 offset)
@@ -114,6 +108,7 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                 u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
                 u64 remote_addr, u32 rkey, enum dma_data_direction dir)
  {
+       struct rdma_rw_reg_ctx *prev = NULL;
         u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
         int i, j, ret = 0, count = 0;
  
@@ -125,7 +120,6 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
         }
  
         for (i = 0; i < ctx->nr_ops; i++) {
-               struct rdma_rw_reg_ctx *prev = i ? &ctx->reg[i - 1] : NULL;
                 struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
                 u32 nents = min(sg_cnt, pages_per_mr);
  
@@ -162,9 +156,13 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                 sg_cnt -= nents;
                 for (j = 0; j < nents; j++)
                         sg = sg_next(sg);
+               prev = reg;
                 offset = 0;
         }
  
+       if (prev)
+               prev->wr.wr.next = NULL;
+
         ctx->type = RDMA_RW_MR;
         return count;
  
@@ -181,7 +179,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                 u64 remote_addr, u32 rkey, enum dma_data_direction dir)
  {
         struct ib_device *dev = qp->pd->device;
-       u32 max_sge = rdma_rw_max_sge(dev, dir);
+       u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
+                     qp->max_read_sge;
         struct ib_sge *sge;
         u32 total_len = 0, i, j;
  
@@ -205,11 +204,10 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                         rdma_wr->wr.opcode = IB_WR_RDMA_READ;
                 rdma_wr->remote_addr = remote_addr + total_len;
                 rdma_wr->rkey = rkey;
+               rdma_wr->wr.num_sge = nr_sge;
                 rdma_wr->wr.sg_list = sge;
  
                 for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
-                       rdma_wr->wr.num_sge++;
-
                         sge->addr = ib_sg_dma_address(dev, sg) + offset;
                         sge->length = ib_sg_dma_len(dev, sg) - offset;
                         sge->lkey = qp->pd->local_dma_lkey;
@@ -220,8 +218,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                         offset = 0;
                 }
  
-               if (i + 1 < ctx->nr_ops)
-                       rdma_wr->wr.next = &ctx->map.wrs[i + 1].wr;
+               rdma_wr->wr.next = i + 1 < ctx->nr_ops ?
+                       &ctx->map.wrs[i + 1].wr : NULL;
         }
  
         ctx->type = RDMA_RW_MULTI_WR;
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c

index e95538650dc6fda8b2d413edfabc60eaad116c28..b9bf7aa055e76fde9144964ce7b2af86329dff89 100644 (file)
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -65,10 +65,17 @@ struct ib_sa_sm_ah {
         u8                   src_path_mask;
  };
  
+struct ib_sa_classport_cache {
+       bool valid;
+       struct ib_class_port_info data;
+};
+
  struct ib_sa_port {
         struct ib_mad_agent *agent;
         struct ib_sa_sm_ah  *sm_ah;
         struct work_struct   update_task;
+       struct ib_sa_classport_cache classport_info;
+       spinlock_t                   classport_lock; /* protects class port info set */
         spinlock_t           ah_lock;
         u8                   port_num;
  };
@@ -998,6 +1005,13 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event
                 port->sm_ah = NULL;
                 spin_unlock_irqrestore(&port->ah_lock, flags);
  
+               if (event->event == IB_EVENT_SM_CHANGE ||
+                   event->event == IB_EVENT_CLIENT_REREGISTER ||
+                   event->event == IB_EVENT_LID_CHANGE) {
+                       spin_lock_irqsave(&port->classport_lock, flags);
+                       port->classport_info.valid = false;
+                       spin_unlock_irqrestore(&port->classport_lock, flags);
+               }
                 queue_work(ib_wq, &sa_dev->port[event->element.port_num -
                                             sa_dev->start_port].update_task);
         }
@@ -1719,6 +1733,7 @@ static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query,
                                               int status,
                                               struct ib_sa_mad *mad)
  {
+       unsigned long flags;
         struct ib_sa_classport_info_query *query =
                 container_of(sa_query, struct ib_sa_classport_info_query, sa_query);
  
@@ -1728,6 +1743,16 @@ static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query,
                 ib_unpack(classport_info_rec_table,
                           ARRAY_SIZE(classport_info_rec_table),
                           mad->data, &rec);
+
+               spin_lock_irqsave(&sa_query->port->classport_lock, flags);
+               if (!status && !sa_query->port->classport_info.valid) {
+                       memcpy(&sa_query->port->classport_info.data, &rec,
+                              sizeof(sa_query->port->classport_info.data));
+
+                       sa_query->port->classport_info.valid = true;
+               }
+               spin_unlock_irqrestore(&sa_query->port->classport_lock, flags);
+
                 query->callback(status, &rec, query->context);
         } else {
                 query->callback(status, NULL, query->context);
@@ -1754,7 +1779,9 @@ int ib_sa_classport_info_rec_query(struct ib_sa_client *client,
         struct ib_sa_port *port;
         struct ib_mad_agent *agent;
         struct ib_sa_mad *mad;
+       struct ib_class_port_info cached_class_port_info;
         int ret;
+       unsigned long flags;
  
         if (!sa_dev)
                 return -ENODEV;
@@ -1762,6 +1789,17 @@ int ib_sa_classport_info_rec_query(struct ib_sa_client *client,
         port  = &sa_dev->port[port_num - sa_dev->start_port];
         agent = port->agent;
  
+       /* Use cached ClassPortInfo attribute if valid instead of sending mad */
+       spin_lock_irqsave(&port->classport_lock, flags);
+       if (port->classport_info.valid && callback) {
+               memcpy(&cached_class_port_info, &port->classport_info.data,
+                      sizeof(cached_class_port_info));
+               spin_unlock_irqrestore(&port->classport_lock, flags);
+               callback(0, &cached_class_port_info, context);
+               return 0;
+       }
+       spin_unlock_irqrestore(&port->classport_lock, flags);
+
         query = kzalloc(sizeof(*query), gfp_mask);
         if (!query)
                 return -ENOMEM;
@@ -1885,6 +1923,9 @@ static void ib_sa_add_one(struct ib_device *device)
                 sa_dev->port[i].sm_ah    = NULL;
                 sa_dev->port[i].port_num = i + s;
  
+               spin_lock_init(&sa_dev->port[i].classport_lock);
+               sa_dev->port[i].classport_info.valid = false;
+
                 sa_dev->port[i].agent =
                         ib_register_mad_agent(device, i + s, IB_QPT_GSI,
                                               NULL, 0, send_handler,
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c

index 60df4f8e81bed10ac8e9b6149ec4c3ef5c6abf9f..15defefecb4f7ac26c137ea9a0398d666fa9aae9 100644 (file)
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -38,6 +38,7 @@
  #include <linux/stat.h>
  #include <linux/string.h>
  #include <linux/netdevice.h>
+#include <linux/ethtool.h>
  
  #include <rdma/ib_mad.h>
  #include <rdma/ib_pma.h>
@@ -1200,16 +1201,28 @@ static ssize_t set_node_desc(struct device *device,
         return count;
  }
  
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+                          char *buf)
+{
+       struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+       ib_get_device_fw_str(dev, buf, PAGE_SIZE);
+       strlcat(buf, "\n", PAGE_SIZE);
+       return strlen(buf);
+}
+
  static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
  static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
  static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
  static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
  
  static struct device_attribute *ib_class_attributes[] = {
         &dev_attr_node_type,
         &dev_attr_sys_image_guid,
         &dev_attr_node_guid,
-       &dev_attr_node_desc
+       &dev_attr_node_desc,
+       &dev_attr_fw_ver,
  };
  
  static void free_port_list_attributes(struct ib_device *device)
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c

index c0f3826abb30aa09d755650d5055610679c846c8..2825ece91d3c5630d5caa8a22f6bf338fe26a87b 100644 (file)
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -106,6 +106,7 @@ struct ucma_multicast {
         int                     events_reported;
  
         u64                     uid;
+       u8                      join_state;
         struct list_head        list;
         struct sockaddr_storage addr;
  };
@@ -1317,12 +1318,20 @@ static ssize_t ucma_process_join(struct ucma_file *file,
         struct ucma_multicast *mc;
         struct sockaddr *addr;
         int ret;
+       u8 join_state;
  
         if (out_len < sizeof(resp))
                 return -ENOSPC;
  
         addr = (struct sockaddr *) &cmd->addr;
-       if (cmd->reserved || !cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr)))
+       if (!cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr)))
+               return -EINVAL;
+
+       if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER)
+               join_state = BIT(FULLMEMBER_JOIN);
+       else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER)
+               join_state = BIT(SENDONLY_FULLMEMBER_JOIN);
+       else
                 return -EINVAL;
  
         ctx = ucma_get_ctx(file, cmd->id);
@@ -1335,10 +1344,11 @@ static ssize_t ucma_process_join(struct ucma_file *file,
                 ret = -ENOMEM;
                 goto err1;
         }
-
+       mc->join_state = join_state;
         mc->uid = cmd->uid;
         memcpy(&mc->addr, addr, cmd->addr_size);
-       ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc);
+       ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr,
+                                 join_state, mc);
         if (ret)
                 goto err2;
  
@@ -1382,7 +1392,7 @@ static ssize_t ucma_join_ip_multicast(struct ucma_file *file,
         join_cmd.uid = cmd.uid;
         join_cmd.id = cmd.id;
         join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr);
-       join_cmd.reserved = 0;
+       join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER;
         memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size);
  
         return ucma_process_join(file, &join_cmd, out_len);
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h

index 612ccfd39bf98181693f07c5dfcc661dc8191e39..df26a741cda6596b68c8362fd693f37f4f73ad76 100644 (file)
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -116,6 +116,7 @@ struct ib_uverbs_event_file {
  struct ib_uverbs_file {
         struct kref                             ref;
         struct mutex                            mutex;
+       struct mutex                            cleanup_mutex; /* protect cleanup */
         struct ib_uverbs_device                *device;
         struct ib_ucontext                     *ucontext;
         struct ib_event_handler                 event_handler;
@@ -162,6 +163,10 @@ struct ib_uqp_object {
         struct ib_uxrcd_object *uxrcd;
  };
  
+struct ib_uwq_object {
+       struct ib_uevent_object uevent;
+};
+
  struct ib_ucq_object {
         struct ib_uobject       uobject;
         struct ib_uverbs_file  *uverbs_file;
@@ -181,6 +186,8 @@ extern struct idr ib_uverbs_qp_idr;
  extern struct idr ib_uverbs_srq_idr;
  extern struct idr ib_uverbs_xrcd_idr;
  extern struct idr ib_uverbs_rule_idr;
+extern struct idr ib_uverbs_wq_idr;
+extern struct idr ib_uverbs_rwq_ind_tbl_idr;
  
  void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
  
@@ -199,6 +206,7 @@ void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
  void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
  void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
  void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr);
  void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
  void ib_uverbs_event_handler(struct ib_event_handler *handler,
                              struct ib_event *event);
@@ -219,6 +227,7 @@ struct ib_uverbs_flow_spec {
                 struct ib_uverbs_flow_spec_eth     eth;
                 struct ib_uverbs_flow_spec_ipv4    ipv4;
                 struct ib_uverbs_flow_spec_tcp_udp tcp_udp;
+               struct ib_uverbs_flow_spec_ipv6    ipv6;
         };
  };
  
@@ -275,5 +284,10 @@ IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
  IB_UVERBS_DECLARE_EX_CMD(query_device);
  IB_UVERBS_DECLARE_EX_CMD(create_cq);
  IB_UVERBS_DECLARE_EX_CMD(create_qp);
+IB_UVERBS_DECLARE_EX_CMD(create_wq);
+IB_UVERBS_DECLARE_EX_CMD(modify_wq);
+IB_UVERBS_DECLARE_EX_CMD(destroy_wq);
+IB_UVERBS_DECLARE_EX_CMD(create_rwq_ind_table);
+IB_UVERBS_DECLARE_EX_CMD(destroy_rwq_ind_table);
  
  #endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c

index 825021d1008b9a13ee5a3b02ed0a1dd7ce80d0ee..f6647318138d77cf7d621dfc9f66515ad9a4b9f9 100644 (file)
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -57,6 +57,8 @@ static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" };
  static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" };
  static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
  static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
+static struct uverbs_lock_class wq_lock_class = { .name = "WQ-uobj" };
+static struct uverbs_lock_class rwq_ind_table_lock_class = { .name = "IND_TBL-uobj" };
  
  /*
   * The ib_uobject locking scheme is as follows:
@@ -243,6 +245,27 @@ static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context)
         return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0);
  }
  
+static struct ib_wq *idr_read_wq(int wq_handle, struct ib_ucontext *context)
+{
+       return idr_read_obj(&ib_uverbs_wq_idr, wq_handle, context, 0);
+}
+
+static void put_wq_read(struct ib_wq *wq)
+{
+       put_uobj_read(wq->uobject);
+}
+
+static struct ib_rwq_ind_table *idr_read_rwq_indirection_table(int ind_table_handle,
+                                                              struct ib_ucontext *context)
+{
+       return idr_read_obj(&ib_uverbs_rwq_ind_tbl_idr, ind_table_handle, context, 0);
+}
+
+static void put_rwq_indirection_table_read(struct ib_rwq_ind_table *ind_table)
+{
+       put_uobj_read(ind_table->uobject);
+}
+
  static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context)
  {
         struct ib_uobject *uobj;
@@ -326,6 +349,8 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
         INIT_LIST_HEAD(&ucontext->qp_list);
         INIT_LIST_HEAD(&ucontext->srq_list);
         INIT_LIST_HEAD(&ucontext->ah_list);
+       INIT_LIST_HEAD(&ucontext->wq_list);
+       INIT_LIST_HEAD(&ucontext->rwq_ind_tbl_list);
         INIT_LIST_HEAD(&ucontext->xrcd_list);
         INIT_LIST_HEAD(&ucontext->rule_list);
         rcu_read_lock();
@@ -1750,6 +1775,8 @@ static int create_qp(struct ib_uverbs_file *file,
         struct ib_qp_init_attr          attr = {};
         struct ib_uverbs_ex_create_qp_resp resp;
         int                             ret;
+       struct ib_rwq_ind_table *ind_tbl = NULL;
+       bool has_sq = true;
  
         if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
                 return -EPERM;
@@ -1761,6 +1788,32 @@ static int create_qp(struct ib_uverbs_file *file,
         init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext,
                   &qp_lock_class);
         down_write(&obj->uevent.uobject.mutex);
+       if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) +
+                     sizeof(cmd->rwq_ind_tbl_handle) &&
+                     (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) {
+               ind_tbl = idr_read_rwq_indirection_table(cmd->rwq_ind_tbl_handle,
+                                                        file->ucontext);
+               if (!ind_tbl) {
+                       ret = -EINVAL;
+                       goto err_put;
+               }
+
+               attr.rwq_ind_tbl = ind_tbl;
+       }
+
+       if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) +
+                      sizeof(cmd->reserved1)) && cmd->reserved1) {
+               ret = -EOPNOTSUPP;
+               goto err_put;
+       }
+
+       if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) {
+               ret = -EINVAL;
+               goto err_put;
+       }
+
+       if (ind_tbl && !cmd->max_send_wr)
+               has_sq = false;
  
         if (cmd->qp_type == IB_QPT_XRC_TGT) {
                 xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext,
@@ -1784,20 +1837,24 @@ static int create_qp(struct ib_uverbs_file *file,
                                 }
                         }
  
-                       if (cmd->recv_cq_handle != cmd->send_cq_handle) {
-                               rcq = idr_read_cq(cmd->recv_cq_handle,
-                                                 file->ucontext, 0);
-                               if (!rcq) {
-                                       ret = -EINVAL;
-                                       goto err_put;
+                       if (!ind_tbl) {
+                               if (cmd->recv_cq_handle != cmd->send_cq_handle) {
+                                       rcq = idr_read_cq(cmd->recv_cq_handle,
+                                                         file->ucontext, 0);
+                                       if (!rcq) {
+                                               ret = -EINVAL;
+                                               goto err_put;
+                                       }
                                 }
                         }
                 }
  
-               scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq);
-               rcq = rcq ?: scq;
+               if (has_sq)
+                       scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq);
+               if (!ind_tbl)
+                       rcq = rcq ?: scq;
                 pd  = idr_read_pd(cmd->pd_handle, file->ucontext);
-               if (!pd || !scq) {
+               if (!pd || (!scq && has_sq)) {
                         ret = -EINVAL;
                         goto err_put;
                 }
@@ -1864,16 +1921,20 @@ static int create_qp(struct ib_uverbs_file *file,
                 qp->send_cq       = attr.send_cq;
                 qp->recv_cq       = attr.recv_cq;
                 qp->srq           = attr.srq;
+               qp->rwq_ind_tbl   = ind_tbl;
                 qp->event_handler = attr.event_handler;
                 qp->qp_context    = attr.qp_context;
                 qp->qp_type       = attr.qp_type;
                 atomic_set(&qp->usecnt, 0);
                 atomic_inc(&pd->usecnt);
-               atomic_inc(&attr.send_cq->usecnt);
+               if (attr.send_cq)
+                       atomic_inc(&attr.send_cq->usecnt);
                 if (attr.recv_cq)
                         atomic_inc(&attr.recv_cq->usecnt);
                 if (attr.srq)
                         atomic_inc(&attr.srq->usecnt);
+               if (ind_tbl)
+                       atomic_inc(&ind_tbl->usecnt);
         }
         qp->uobject = &obj->uevent.uobject;
  
@@ -1913,6 +1974,8 @@ static int create_qp(struct ib_uverbs_file *file,
                 put_cq_read(rcq);
         if (srq)
                 put_srq_read(srq);
+       if (ind_tbl)
+               put_rwq_indirection_table_read(ind_tbl);
  
         mutex_lock(&file->mutex);
         list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
@@ -1940,6 +2003,8 @@ err_put:
                 put_cq_read(rcq);
         if (srq)
                 put_srq_read(srq);
+       if (ind_tbl)
+               put_rwq_indirection_table_read(ind_tbl);
  
         put_uobj_write(&obj->uevent.uobject);
         return ret;
@@ -2033,7 +2098,7 @@ int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file,
         if (err)
                 return err;
  
-       if (cmd.comp_mask)
+       if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK)
                 return -EINVAL;
  
         if (cmd.reserved)
@@ -3040,6 +3105,15 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
                 memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask,
                        sizeof(struct ib_flow_ipv4_filter));
                 break;
+       case IB_FLOW_SPEC_IPV6:
+               ib_spec->ipv6.size = sizeof(struct ib_flow_spec_ipv6);
+               if (ib_spec->ipv6.size != kern_spec->ipv6.size)
+                       return -EINVAL;
+               memcpy(&ib_spec->ipv6.val, &kern_spec->ipv6.val,
+                      sizeof(struct ib_flow_ipv6_filter));
+               memcpy(&ib_spec->ipv6.mask, &kern_spec->ipv6.mask,
+                      sizeof(struct ib_flow_ipv6_filter));
+               break;
         case IB_FLOW_SPEC_TCP:
         case IB_FLOW_SPEC_UDP:
                 ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp);
@@ -3056,6 +3130,445 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
         return 0;
  }
  
+/*
+ * ib_uverbs_ex_create_wq - extended uverbs command that creates a WQ.
+ * @file:   uverbs file issuing the command; also stored as the WQ context.
+ * @ib_dev: device the command was issued on (not referenced here, the WQ is
+ *          created through pd->device).
+ * @ucore:  core command/response buffers.
+ * @uhw:    driver-private buffers, handed to the driver's create_wq().
+ *
+ * Returns 0 on success.  Fails with -EINVAL when the command is too short or
+ * the PD/CQ handle cannot be resolved, -ENOSPC when the response buffer is
+ * too small, -EOPNOTSUPP when unknown trailing input or comp_mask bits are
+ * set, -ENOMEM when the uobject cannot be allocated, or with the error
+ * reported by the driver, idr_add_uobj() or the udata copy helpers.
+ */
+int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file,
+                          struct ib_device *ib_dev,
+                          struct ib_udata *ucore,
+                          struct ib_udata *uhw)
+{
+       struct ib_uverbs_ex_create_wq     cmd = {};
+       struct ib_uverbs_ex_create_wq_resp resp = {};
+       struct ib_uwq_object           *obj;
+       int err = 0;
+       struct ib_cq *cq;
+       struct ib_pd *pd;
+       struct ib_wq *wq;
+       struct ib_wq_init_attr wq_init_attr = {};
+       size_t required_cmd_sz;
+       size_t required_resp_len;
+
+       /* Smallest command/response layouts this handler accepts */
+       required_cmd_sz = offsetof(typeof(cmd), max_sge) + sizeof(cmd.max_sge);
+       required_resp_len = offsetof(typeof(resp), wqn) + sizeof(resp.wqn);
+
+       if (ucore->inlen < required_cmd_sz)
+               return -EINVAL;
+
+       if (ucore->outlen < required_resp_len)
+               return -ENOSPC;
+
+       /* Input beyond the known command layout must be all zeroes */
+       if (ucore->inlen > sizeof(cmd) &&
+           !ib_is_udata_cleared(ucore, sizeof(cmd),
+                                ucore->inlen - sizeof(cmd)))
+               return -EOPNOTSUPP;
+
+       err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+       if (err)
+               return err;
+
+       if (cmd.comp_mask)
+               return -EOPNOTSUPP;
+
+       obj = kmalloc(sizeof(*obj), GFP_KERNEL);
+       if (!obj)
+               return -ENOMEM;
+
+       init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext,
+                 &wq_lock_class);
+       down_write(&obj->uevent.uobject.mutex);
+       pd  = idr_read_pd(cmd.pd_handle, file->ucontext);
+       if (!pd) {
+               err = -EINVAL;
+               goto err_uobj;
+       }
+
+       cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+       if (!cq) {
+               err = -EINVAL;
+               goto err_put_pd;
+       }
+
+       wq_init_attr.cq = cq;
+       wq_init_attr.max_sge = cmd.max_sge;
+       wq_init_attr.max_wr = cmd.max_wr;
+       wq_init_attr.wq_context = file;
+       wq_init_attr.wq_type = cmd.wq_type;
+       wq_init_attr.event_handler = ib_uverbs_wq_event_handler;
+       obj->uevent.events_reported = 0;
+       INIT_LIST_HEAD(&obj->uevent.event_list);
+       wq = pd->device->create_wq(pd, &wq_init_attr, uhw);
+       if (IS_ERR(wq)) {
+               err = PTR_ERR(wq);
+               goto err_put_cq;
+       }
+
+       /* Fill in the core-owned fields, mirroring ib_create_wq() */
+       wq->uobject = &obj->uevent.uobject;
+       obj->uevent.uobject.object = wq;
+       wq->wq_type = wq_init_attr.wq_type;
+       wq->cq = cq;
+       wq->pd = pd;
+       wq->device = pd->device;
+       wq->wq_context = wq_init_attr.wq_context;
+       /* Without this the handler set up above is never attached to the WQ */
+       wq->event_handler = wq_init_attr.event_handler;
+       atomic_set(&wq->usecnt, 0);
+       atomic_inc(&pd->usecnt);
+       atomic_inc(&cq->usecnt);
+       err = idr_add_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject);
+       if (err)
+               goto destroy_wq;
+
+       memset(&resp, 0, sizeof(resp));
+       resp.wq_handle = obj->uevent.uobject.id;
+       resp.max_sge = wq_init_attr.max_sge;
+       resp.max_wr = wq_init_attr.max_wr;
+       resp.wqn = wq->wq_num;
+       resp.response_length = required_resp_len;
+       err = ib_copy_to_udata(ucore,
+                              &resp, resp.response_length);
+       if (err)
+               goto err_copy;
+
+       put_pd_read(pd);
+       put_cq_read(cq);
+
+       mutex_lock(&file->mutex);
+       list_add_tail(&obj->uevent.uobject.list, &file->ucontext->wq_list);
+       mutex_unlock(&file->mutex);
+
+       obj->uevent.uobject.live = 1;
+       up_write(&obj->uevent.uobject.mutex);
+       return 0;
+
+       /* Unwind in reverse order of acquisition */
+err_copy:
+       idr_remove_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject);
+destroy_wq:
+       ib_destroy_wq(wq);
+err_put_cq:
+       put_cq_read(cq);
+err_put_pd:
+       put_pd_read(pd);
+err_uobj:
+       put_uobj_write(&obj->uevent.uobject);
+
+       return err;
+}
+
+/*
+ * ib_uverbs_ex_destroy_wq - extended uverbs command that destroys a WQ.
+ *
+ * Validates the command, destroys the WQ named by cmd.wq_handle, drops the
+ * uobject from the idr and from the per-file list, and reports the number of
+ * events recorded for the object back to user space.
+ *
+ * Returns 0 on success, -EINVAL for a short command or unknown handle,
+ * -ENOSPC for a short response buffer, -EOPNOTSUPP for unknown trailing
+ * input or comp_mask bits, or the error from ib_destroy_wq() or the udata
+ * copy helpers.
+ */
+int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file,
+                           struct ib_device *ib_dev,
+                           struct ib_udata *ucore,
+                           struct ib_udata *uhw)
+{
+       struct ib_uverbs_ex_destroy_wq          cmd = {};
+       struct ib_uverbs_ex_destroy_wq_resp     resp = {};
+       struct ib_uwq_object                    *uwq;
+       struct ib_uobject                       *uobj;
+       size_t min_cmd_sz, min_resp_sz;
+       int err;
+
+       min_cmd_sz = offsetof(typeof(cmd), wq_handle) + sizeof(cmd.wq_handle);
+       min_resp_sz = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+
+       if (ucore->inlen < min_cmd_sz)
+               return -EINVAL;
+
+       if (ucore->outlen < min_resp_sz)
+               return -ENOSPC;
+
+       /* Anything past the known command layout has to be zeroed */
+       if (ucore->inlen > sizeof(cmd) &&
+           !ib_is_udata_cleared(ucore, sizeof(cmd),
+                                ucore->inlen - sizeof(cmd)))
+               return -EOPNOTSUPP;
+
+       err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+       if (err)
+               return err;
+
+       if (cmd.comp_mask)
+               return -EOPNOTSUPP;
+
+       uobj = idr_write_uobj(&ib_uverbs_wq_idr, cmd.wq_handle,
+                             file->ucontext);
+       if (!uobj)
+               return -EINVAL;
+
+       uwq = container_of(uobj, struct ib_uwq_object, uevent.uobject);
+
+       err = ib_destroy_wq(uobj->object);
+       if (err) {
+               /* Destroy failed: the object stays live and registered */
+               put_uobj_write(uobj);
+               return err;
+       }
+       uobj->live = 0;
+       put_uobj_write(uobj);
+
+       idr_remove_uobj(&ib_uverbs_wq_idr, uobj);
+
+       mutex_lock(&file->mutex);
+       list_del(&uobj->list);
+       mutex_unlock(&file->mutex);
+
+       ib_uverbs_release_uevent(file, &uwq->uevent);
+
+       /* Read the counter before the final reference is dropped */
+       resp.response_length = min_resp_sz;
+       resp.events_reported = uwq->uevent.events_reported;
+       put_uobj(uobj);
+
+       return ib_copy_to_udata(ucore, &resp, resp.response_length);
+}
+
+/*
+ * ib_uverbs_ex_modify_wq - extended uverbs command that modifies a WQ.
+ * @file:   uverbs file issuing the command.
+ * @ib_dev: device the command was issued on (not referenced here, the call
+ *          goes through wq->device).
+ * @ucore:  core command buffer.
+ * @uhw:    driver-private buffers, handed to the driver's modify_wq().
+ *
+ * Only the IB_WQ_STATE and IB_WQ_CUR_STATE attributes may be selected and at
+ * least one of them must be.
+ *
+ * Returns -EINVAL for a short command, an empty or unsupported attr_mask or
+ * an unknown WQ handle, -EOPNOTSUPP for non-zero trailing input, the error
+ * from ib_copy_from_udata(), or otherwise the driver's modify_wq() result.
+ */
+int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file,
+                          struct ib_device *ib_dev,
+                          struct ib_udata *ucore,
+                          struct ib_udata *uhw)
+{
+       struct ib_uverbs_ex_modify_wq cmd = {};
+       struct ib_wq *wq;
+       struct ib_wq_attr wq_attr = {};
+       size_t required_cmd_sz;
+       int ret;
+
+       required_cmd_sz = offsetof(typeof(cmd), curr_wq_state) + sizeof(cmd.curr_wq_state);
+       if (ucore->inlen < required_cmd_sz)
+               return -EINVAL;
+
+       /* Input beyond the known command layout must be all zeroes */
+       if (ucore->inlen > sizeof(cmd) &&
+           !ib_is_udata_cleared(ucore, sizeof(cmd),
+                                ucore->inlen - sizeof(cmd)))
+               return -EOPNOTSUPP;
+
+       ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+       if (ret)
+               return ret;
+
+       if (!cmd.attr_mask)
+               return -EINVAL;
+
+       /*
+        * Reject any bit outside the supported set.  Test the bits rather
+        * than the magnitude so the check does not depend on where the
+        * supported flags sit in the mask.
+        */
+       if (cmd.attr_mask & ~(IB_WQ_STATE | IB_WQ_CUR_STATE))
+               return -EINVAL;
+
+       wq = idr_read_wq(cmd.wq_handle, file->ucontext);
+       if (!wq)
+               return -EINVAL;
+
+       wq_attr.curr_wq_state = cmd.curr_wq_state;
+       wq_attr.wq_state = cmd.wq_state;
+       ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw);
+       put_wq_read(wq);
+       return ret;
+}
+
+/*
+ * ib_uverbs_ex_create_rwq_ind_table - extended uverbs command that creates a
+ * receive WQ indirection table.
+ *
+ * The command is a fixed header followed by (1 << log_ind_tbl_size) 32-bit
+ * WQ handles.  Each handle is resolved to a WQ, the array of WQ pointers is
+ * handed to the driver and, on success, kept as the table's ind_tbl; the
+ * handle array itself is only needed while parsing and is always freed.
+ *
+ * Returns 0 on success, -EINVAL for a short command, an oversized table or
+ * an unknown WQ handle, -ENOSPC for a short response buffer, -EOPNOTSUPP
+ * for comp_mask bits or non-zero trailing input, -ENOMEM on allocation
+ * failure, or the error from the driver, idr_add_uobj() or the udata copy
+ * helpers.
+ */
+int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
+                                     struct ib_device *ib_dev,
+                                     struct ib_udata *ucore,
+                                     struct ib_udata *uhw)
+{
+       struct ib_uverbs_ex_create_rwq_ind_table       cmd = {};
+       struct ib_uverbs_ex_create_rwq_ind_table_resp  resp = {};
+       struct ib_rwq_ind_table_init_attr init_attr = {};
+       struct ib_rwq_ind_table *ind_tbl;
+       struct ib_uobject *uobj;
+       struct ib_wq **wq_tbl = NULL;
+       u32 *handles = NULL;
+       size_t hdr_sz, resp_sz;
+       u32 tbl_entries, handles_sz;
+       int nr_held, idx;
+       int err;
+
+       hdr_sz = offsetof(typeof(cmd), log_ind_tbl_size) + sizeof(cmd.log_ind_tbl_size);
+       resp_sz = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num);
+
+       if (ucore->inlen < hdr_sz)
+               return -EINVAL;
+
+       if (ucore->outlen < resp_sz)
+               return -ENOSPC;
+
+       err = ib_copy_from_udata(&cmd, ucore, hdr_sz);
+       if (err)
+               return err;
+
+       /* Step past the header; what remains is the handle array */
+       ucore->inbuf += hdr_sz;
+       ucore->inlen -= hdr_sz;
+
+       if (cmd.comp_mask)
+               return -EOPNOTSUPP;
+
+       if (cmd.log_ind_tbl_size > IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE)
+               return -EINVAL;
+
+       tbl_entries = 1 << cmd.log_ind_tbl_size;
+       handles_sz = tbl_entries * sizeof(__u32);
+       /* input size for wq handles is u64 aligned */
+       if (tbl_entries == 1)
+               handles_sz += sizeof(__u32);
+
+       if (ucore->inlen < handles_sz)
+               return -EINVAL;
+
+       if (ucore->inlen > handles_sz &&
+           !ib_is_udata_cleared(ucore, handles_sz,
+                                ucore->inlen - handles_sz))
+               return -EOPNOTSUPP;
+
+       handles = kcalloc(tbl_entries, sizeof(*handles), GFP_KERNEL);
+       if (!handles)
+               return -ENOMEM;
+
+       err = ib_copy_from_udata(handles, ucore,
+                                tbl_entries * sizeof(__u32));
+       if (err)
+               goto err_free;
+
+       wq_tbl = kcalloc(tbl_entries, sizeof(*wq_tbl), GFP_KERNEL);
+       if (!wq_tbl) {
+               err = -ENOMEM;
+               goto err_free;
+       }
+
+       /* nr_held counts the WQs that must be released with put_wq_read() */
+       for (nr_held = 0; nr_held < tbl_entries; nr_held++) {
+               wq_tbl[nr_held] = idr_read_wq(handles[nr_held],
+                                             file->ucontext);
+               if (!wq_tbl[nr_held]) {
+                       err = -EINVAL;
+                       goto put_wqs;
+               }
+       }
+
+       uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+       if (!uobj) {
+               err = -ENOMEM;
+               goto put_wqs;
+       }
+
+       init_uobj(uobj, 0, file->ucontext, &rwq_ind_table_lock_class);
+       down_write(&uobj->mutex);
+
+       init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size;
+       init_attr.ind_tbl = wq_tbl;
+       ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &init_attr, uhw);
+       if (IS_ERR(ind_tbl)) {
+               err = PTR_ERR(ind_tbl);
+               goto err_uobj;
+       }
+
+       ind_tbl->ind_tbl = wq_tbl;
+       ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size;
+       ind_tbl->uobject = uobj;
+       uobj->object = ind_tbl;
+       ind_tbl->device = ib_dev;
+       atomic_set(&ind_tbl->usecnt, 0);
+
+       /* Every WQ referenced by the table gains a user */
+       for (idx = 0; idx < tbl_entries; idx++)
+               atomic_inc(&wq_tbl[idx]->usecnt);
+
+       err = idr_add_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+       if (err)
+               goto destroy_ind_tbl;
+
+       resp.ind_tbl_handle = uobj->id;
+       resp.ind_tbl_num = ind_tbl->ind_tbl_num;
+       resp.response_length = resp_sz;
+
+       err = ib_copy_to_udata(ucore,
+                              &resp, resp.response_length);
+       if (err)
+               goto err_copy;
+
+       kfree(handles);
+
+       for (idx = 0; idx < nr_held; idx++)
+               put_wq_read(wq_tbl[idx]);
+
+       mutex_lock(&file->mutex);
+       list_add_tail(&uobj->list, &file->ucontext->rwq_ind_tbl_list);
+       mutex_unlock(&file->mutex);
+
+       uobj->live = 1;
+
+       up_write(&uobj->mutex);
+       return 0;
+
+err_copy:
+       idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+destroy_ind_tbl:
+       ib_destroy_rwq_ind_table(ind_tbl);
+err_uobj:
+       put_uobj_write(uobj);
+put_wqs:
+       for (idx = 0; idx < nr_held; idx++)
+               put_wq_read(wq_tbl[idx]);
+err_free:
+       kfree(handles);
+       kfree(wq_tbl);
+       return err;
+}
+
+/*
+ * ib_uverbs_ex_destroy_rwq_ind_table - extended uverbs command that destroys
+ * a receive WQ indirection table.
+ *
+ * On success the uobject is removed from the idr and the per-file list and
+ * the WQ pointer array that backed the table is freed.
+ *
+ * Returns 0 on success, -EINVAL for a short command or unknown handle,
+ * -EOPNOTSUPP for comp_mask bits or non-zero trailing input, or the error
+ * from ib_copy_from_udata() or ib_destroy_rwq_ind_table().
+ */
+int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file,
+                                      struct ib_device *ib_dev,
+                                      struct ib_udata *ucore,
+                                      struct ib_udata *uhw)
+{
+       struct ib_uverbs_ex_destroy_rwq_ind_table       cmd = {};
+       struct ib_rwq_ind_table *tbl;
+       struct ib_uobject *uobj;
+       struct ib_wq **wq_array;
+       size_t min_cmd_sz;
+       int err;
+
+       min_cmd_sz = offsetof(typeof(cmd), ind_tbl_handle) + sizeof(cmd.ind_tbl_handle);
+
+       if (ucore->inlen < min_cmd_sz)
+               return -EINVAL;
+
+       if (ucore->inlen > sizeof(cmd) &&
+           !ib_is_udata_cleared(ucore, sizeof(cmd),
+                                ucore->inlen - sizeof(cmd)))
+               return -EOPNOTSUPP;
+
+       err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+       if (err)
+               return err;
+
+       if (cmd.comp_mask)
+               return -EOPNOTSUPP;
+
+       uobj = idr_write_uobj(&ib_uverbs_rwq_ind_tbl_idr, cmd.ind_tbl_handle,
+                             file->ucontext);
+       if (!uobj)
+               return -EINVAL;
+
+       tbl = uobj->object;
+       /* Remember the array now; it is freed only after the table is gone */
+       wq_array = tbl->ind_tbl;
+
+       err = ib_destroy_rwq_ind_table(tbl);
+       if (err) {
+               put_uobj_write(uobj);
+               return err;
+       }
+       uobj->live = 0;
+       put_uobj_write(uobj);
+
+       idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+
+       mutex_lock(&file->mutex);
+       list_del(&uobj->list);
+       mutex_unlock(&file->mutex);
+
+       put_uobj(uobj);
+       kfree(wq_array);
+       return 0;
+}
+
  int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
                              struct ib_device *ib_dev,
                              struct ib_udata *ucore,
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c

index 31f422a70623a5df98f94acc90a6f6a2d598c2bc..0012fa58c105ded78fa433b89024b8ab176aced0 100644 (file)
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -76,6 +76,8 @@ DEFINE_IDR(ib_uverbs_qp_idr);
  DEFINE_IDR(ib_uverbs_srq_idr);
  DEFINE_IDR(ib_uverbs_xrcd_idr);
  DEFINE_IDR(ib_uverbs_rule_idr);
+DEFINE_IDR(ib_uverbs_wq_idr);
+DEFINE_IDR(ib_uverbs_rwq_ind_tbl_idr);
  
  static DEFINE_SPINLOCK(map_lock);
  static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -130,6 +132,11 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
         [IB_USER_VERBS_EX_CMD_QUERY_DEVICE]     = ib_uverbs_ex_query_device,
         [IB_USER_VERBS_EX_CMD_CREATE_CQ]        = ib_uverbs_ex_create_cq,
         [IB_USER_VERBS_EX_CMD_CREATE_QP]        = ib_uverbs_ex_create_qp,
+       [IB_USER_VERBS_EX_CMD_CREATE_WQ]        = ib_uverbs_ex_create_wq,
+       [IB_USER_VERBS_EX_CMD_MODIFY_WQ]        = ib_uverbs_ex_modify_wq,
+       [IB_USER_VERBS_EX_CMD_DESTROY_WQ]       = ib_uverbs_ex_destroy_wq,
+       [IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL] = ib_uverbs_ex_create_rwq_ind_table,
+       [IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL] = ib_uverbs_ex_destroy_rwq_ind_table,
  };
  
  static void ib_uverbs_add_one(struct ib_device *device);
@@ -265,6 +272,27 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
                 kfree(uqp);
         }
  
+       list_for_each_entry_safe(uobj, tmp, &context->rwq_ind_tbl_list, list) {
+               struct ib_rwq_ind_table *rwq_ind_tbl = uobj->object;
+               struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl;
+
+               idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+               ib_destroy_rwq_ind_table(rwq_ind_tbl);
+               kfree(ind_tbl);
+               kfree(uobj);
+       }
+
+       list_for_each_entry_safe(uobj, tmp, &context->wq_list, list) {
+               struct ib_wq *wq = uobj->object;
+               struct ib_uwq_object *uwq =
+                       container_of(uobj, struct ib_uwq_object, uevent.uobject);
+
+               idr_remove_uobj(&ib_uverbs_wq_idr, uobj);
+               ib_destroy_wq(wq);
+               ib_uverbs_release_uevent(file, &uwq->uevent);
+               kfree(uwq);
+       }
+
         list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) {
                 struct ib_srq *srq = uobj->object;
                 struct ib_uevent_object *uevent =
@@ -568,6 +596,16 @@ void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
                                 &uobj->events_reported);
  }
  
+/*
+ * Async event callback for WQs created through uverbs: forwards the event to
+ * ib_uverbs_async_handler() together with the user handle, event list and
+ * event counter of the uevent object that wraps the WQ's uobject.
+ */
+void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr)
+{
+       struct ib_uevent_object *uevent;
+
+       uevent = container_of(event->element.wq->uobject,
+                             struct ib_uevent_object, uobject);
+
+       ib_uverbs_async_handler(context_ptr, uevent->uobject.user_handle,
+                               event->event, &uevent->event_list,
+                               &uevent->events_reported);
+}
+
  void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
  {
         struct ib_uevent_object *uobj;
@@ -931,6 +969,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
         file->async_file = NULL;
         kref_init(&file->ref);
         mutex_init(&file->mutex);
+       mutex_init(&file->cleanup_mutex);
  
         filp->private_data = file;
         kobject_get(&dev->kobj);
@@ -956,18 +995,20 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp)
  {
         struct ib_uverbs_file *file = filp->private_data;
         struct ib_uverbs_device *dev = file->device;
-       struct ib_ucontext *ucontext = NULL;
+
+       mutex_lock(&file->cleanup_mutex);
+       if (file->ucontext) {
+               ib_uverbs_cleanup_ucontext(file, file->ucontext);
+               file->ucontext = NULL;
+       }
+       mutex_unlock(&file->cleanup_mutex);
  
         mutex_lock(&file->device->lists_mutex);
-       ucontext = file->ucontext;
-       file->ucontext = NULL;
         if (!file->is_closed) {
                 list_del(&file->list);
                 file->is_closed = 1;
         }
         mutex_unlock(&file->device->lists_mutex);
-       if (ucontext)
-               ib_uverbs_cleanup_ucontext(file, ucontext);
  
         if (file->async_file)
                 kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
@@ -1181,22 +1222,30 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
         mutex_lock(&uverbs_dev->lists_mutex);
         while (!list_empty(&uverbs_dev->uverbs_file_list)) {
                 struct ib_ucontext *ucontext;
-
                 file = list_first_entry(&uverbs_dev->uverbs_file_list,
                                         struct ib_uverbs_file, list);
                 file->is_closed = 1;
-               ucontext = file->ucontext;
                 list_del(&file->list);
-               file->ucontext = NULL;
                 kref_get(&file->ref);
                 mutex_unlock(&uverbs_dev->lists_mutex);
-               /* We must release the mutex before going ahead and calling
-                * disassociate_ucontext. disassociate_ucontext might end up
-                * indirectly calling uverbs_close, for example due to freeing
-                * the resources (e.g mmput).
-                */
+
                 ib_uverbs_event_handler(&file->event_handler, &event);
+
+               mutex_lock(&file->cleanup_mutex);
+               ucontext = file->ucontext;
+               file->ucontext = NULL;
+               mutex_unlock(&file->cleanup_mutex);
+
+               /* At this point ib_uverbs_close cannot be running
+                * ib_uverbs_cleanup_ucontext
+                */
                 if (ucontext) {
+                       /* We must release the mutex before going ahead and
+                        * calling disassociate_ucontext. disassociate_ucontext
+                        * might end up indirectly calling uverbs_close,
+                        * for example due to freeing the resources
+                        * (e.g mmput).
+                        */
                         ib_dev->disassociate_ucontext(ucontext);
                         ib_uverbs_cleanup_ucontext(file, ucontext);
                 }
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c

index 6298f54b413756a5bf0f19891a080739ec6300ef..f2b776efab3a3ee1ffdc306ef7af01d3093e9ed7 100644 (file)
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -758,6 +758,12 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
         struct ib_qp *qp;
         int ret;
  
+       if (qp_init_attr->rwq_ind_tbl &&
+           (qp_init_attr->recv_cq ||
+           qp_init_attr->srq || qp_init_attr->cap.max_recv_wr ||
+           qp_init_attr->cap.max_recv_sge))
+               return ERR_PTR(-EINVAL);
+
         /*
          * If the callers is using the RDMA API calculate the resources
          * needed for the RDMA READ/WRITE operations.
@@ -775,6 +781,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
         qp->real_qp    = qp;
         qp->uobject    = NULL;
         qp->qp_type    = qp_init_attr->qp_type;
+       qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl;
  
         atomic_set(&qp->usecnt, 0);
         qp->mrs_used = 0;
@@ -792,7 +799,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
                 qp->srq = NULL;
         } else {
                 qp->recv_cq = qp_init_attr->recv_cq;
-               atomic_inc(&qp_init_attr->recv_cq->usecnt);
+               if (qp_init_attr->recv_cq)
+                       atomic_inc(&qp_init_attr->recv_cq->usecnt);
                 qp->srq = qp_init_attr->srq;
                 if (qp->srq)
                         atomic_inc(&qp_init_attr->srq->usecnt);
@@ -803,7 +811,10 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
         qp->xrcd    = NULL;
  
         atomic_inc(&pd->usecnt);
-       atomic_inc(&qp_init_attr->send_cq->usecnt);
+       if (qp_init_attr->send_cq)
+               atomic_inc(&qp_init_attr->send_cq->usecnt);
+       if (qp_init_attr->rwq_ind_tbl)
+               atomic_inc(&qp->rwq_ind_tbl->usecnt);
  
         if (qp_init_attr->cap.max_rdma_ctxs) {
                 ret = rdma_rw_init_mrs(qp, qp_init_attr);
@@ -814,6 +825,15 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
                 }
         }
  
+       /*
+        * Note: all hw drivers guarantee that max_send_sge is lower than
+        * the device RDMA WRITE SGE limit but not all hw drivers ensure that
+        * max_send_sge <= max_sge_rd.
+        */
+       qp->max_write_sge = qp_init_attr->cap.max_send_sge;
+       qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge,
+                                device->attrs.max_sge_rd);
+
         return qp;
  }
  EXPORT_SYMBOL(ib_create_qp);
@@ -1283,6 +1303,7 @@ int ib_destroy_qp(struct ib_qp *qp)
         struct ib_pd *pd;
         struct ib_cq *scq, *rcq;
         struct ib_srq *srq;
+       struct ib_rwq_ind_table *ind_tbl;
         int ret;
  
         WARN_ON_ONCE(qp->mrs_used > 0);
@@ -1297,6 +1318,7 @@ int ib_destroy_qp(struct ib_qp *qp)
         scq  = qp->send_cq;
         rcq  = qp->recv_cq;
         srq  = qp->srq;
+       ind_tbl = qp->rwq_ind_tbl;
  
         if (!qp->uobject)
                 rdma_rw_cleanup_mrs(qp);
@@ -1311,6 +1333,8 @@ int ib_destroy_qp(struct ib_qp *qp)
                         atomic_dec(&rcq->usecnt);
                 if (srq)
                         atomic_dec(&srq->usecnt);
+               if (ind_tbl)
+                       atomic_dec(&ind_tbl->usecnt);
         }
  
         return ret;
@@ -1558,6 +1582,150 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
  }
  EXPORT_SYMBOL(ib_dealloc_xrcd);
  
+/**
+ * ib_create_wq - Creates a WQ associated with the specified protection
+ * domain.
+ * @pd: The protection domain associated with the WQ.
+ * @wq_attr: A list of initial attributes required to create the
+ * WQ. If WQ creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created WQ.
+ *
+ * wq_attr->max_wr and wq_attr->max_sge determine
+ * the requested size of the WQ, and set to the actual values allocated
+ * on return.
+ * If ib_create_wq() succeeds, then max_wr and max_sge will always be
+ * at least as large as the requested values.
+ *
+ * Returns the new WQ, ERR_PTR(-ENOSYS) when the device has no create_wq
+ * hook, or the ERR_PTR() value produced by the driver.
+ */
+struct ib_wq *ib_create_wq(struct ib_pd *pd,
+                          struct ib_wq_init_attr *wq_attr)
+{
+       struct ib_device *dev = pd->device;
+       struct ib_wq *wq;
+
+       if (!dev->create_wq)
+               return ERR_PTR(-ENOSYS);
+
+       wq = dev->create_wq(pd, wq_attr, NULL);
+       if (IS_ERR(wq))
+               return wq;
+
+       /* Core-owned fields of a kernel WQ (no uobject) */
+       wq->device = dev;
+       wq->pd = pd;
+       wq->cq = wq_attr->cq;
+       wq->wq_type = wq_attr->wq_type;
+       wq->wq_context = wq_attr->wq_context;
+       wq->event_handler = wq_attr->event_handler;
+       wq->uobject = NULL;
+       atomic_set(&wq->usecnt, 0);
+
+       /* The WQ is a user of both its PD and its CQ */
+       atomic_inc(&pd->usecnt);
+       atomic_inc(&wq_attr->cq->usecnt);
+
+       return wq;
+}
+EXPORT_SYMBOL(ib_create_wq);
+
+/**
+ * ib_destroy_wq - Destroys the specified WQ.
+ * @wq: The WQ to destroy.
+ *
+ * Returns -EBUSY while the WQ still has users, otherwise the result of the
+ * driver's destroy_wq(); the PD and CQ use counts are dropped only when
+ * the driver reports success.
+ */
+int ib_destroy_wq(struct ib_wq *wq)
+{
+       /* Sample these up front, they are needed after the driver call */
+       struct ib_pd *pd = wq->pd;
+       struct ib_cq *cq = wq->cq;
+       int ret;
+
+       if (atomic_read(&wq->usecnt))
+               return -EBUSY;
+
+       ret = wq->device->destroy_wq(wq);
+       if (ret)
+               return ret;
+
+       atomic_dec(&pd->usecnt);
+       atomic_dec(&cq->usecnt);
+       return 0;
+}
+EXPORT_SYMBOL(ib_destroy_wq);
+
+/**
+ * ib_modify_wq - Modifies the specified WQ.
+ * @wq: The WQ to modify.
+ * @wq_attr: On input, specifies the WQ attributes to modify.
+ * @wq_attr_mask: A bit-mask used to specify which attributes of the WQ
+ *   are being modified.
+ * On output, the current values of selected WQ attributes are returned.
+ *
+ * Returns -ENOSYS when the device has no modify_wq hook, otherwise the
+ * result of that hook.
+ */
+int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+                u32 wq_attr_mask)
+{
+       struct ib_device *dev = wq->device;
+
+       if (!dev->modify_wq)
+               return -ENOSYS;
+
+       return dev->modify_wq(wq, wq_attr, wq_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_wq);
+
+/*
+ * ib_create_rwq_ind_table - Creates a RQ Indirection Table.
+ * @device: The device on which to create the rwq indirection table.
+ * @init_attr: A list of initial attributes required to create the
+ * Indirection Table.
+ *
+ * Note: The life time of init_attr->ind_tbl is not less than the created
+ *     ib_rwq_ind_table object and the caller is responsible for its memory
+ *     allocation/free.
+ *
+ * Returns the new table, ERR_PTR(-ENOSYS) when the device has no
+ * create_rwq_ind_table hook, or the ERR_PTR() value from the driver.
+ */
+struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device,
+                                                struct ib_rwq_ind_table_init_attr *init_attr)
+{
+       struct ib_rwq_ind_table *tbl;
+       u32 nr_wqs;
+       u32 idx;
+
+       if (!device->create_rwq_ind_table)
+               return ERR_PTR(-ENOSYS);
+
+       nr_wqs = (1 << init_attr->log_ind_tbl_size);
+
+       tbl = device->create_rwq_ind_table(device, init_attr, NULL);
+       if (IS_ERR(tbl))
+               return tbl;
+
+       tbl->device = device;
+       tbl->uobject = NULL;
+       tbl->ind_tbl = init_attr->ind_tbl;
+       tbl->log_ind_tbl_size = init_attr->log_ind_tbl_size;
+       atomic_set(&tbl->usecnt, 0);
+
+       /* Each WQ listed in the table gains a user */
+       for (idx = 0; idx < nr_wqs; idx++)
+               atomic_inc(&tbl->ind_tbl[idx]->usecnt);
+
+       return tbl;
+}
+EXPORT_SYMBOL(ib_create_rwq_ind_table);
+
+/*
+ * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table.
+ * @rwq_ind_table: The Indirection Table to destroy.
+ *
+ * Returns -EBUSY while the table still has users, otherwise the result of
+ * the driver's destroy_rwq_ind_table(); the use counts of the member WQs are
+ * dropped only when the driver reports success.
+ */
+int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table)
+{
+       /* Sample these up front, they are needed after the driver call */
+       struct ib_wq **members = rwq_ind_table->ind_tbl;
+       u32 nr_wqs = (1 << rwq_ind_table->log_ind_tbl_size);
+       u32 idx;
+       int ret;
+
+       if (atomic_read(&rwq_ind_table->usecnt))
+               return -EBUSY;
+
+       ret = rwq_ind_table->device->destroy_rwq_ind_table(rwq_ind_table);
+       if (ret)
+               return ret;
+
+       for (idx = 0; idx < nr_wqs; idx++)
+               atomic_dec(&members[idx]->usecnt);
+
+       return 0;
+}
+EXPORT_SYMBOL(ib_destroy_rwq_ind_table);
+
  struct ib_flow *ib_create_flow(struct ib_qp *qp,
                                struct ib_flow_attr *flow_attr,
                                int domain)
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c

index 3e8431b5cad733d778b27e481bd44ade8a50c35c..04bbf172abde10a090678f62cfb4c3d48b044c1b 100644 (file)
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -1396,10 +1396,10 @@ static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
         state_set(&child_ep->com, CONNECTING);
         child_ep->com.tdev = tdev;
         child_ep->com.cm_id = NULL;
-       child_ep->com.local_addr.sin_family = PF_INET;
+       child_ep->com.local_addr.sin_family = AF_INET;
         child_ep->com.local_addr.sin_port = req->local_port;
         child_ep->com.local_addr.sin_addr.s_addr = req->local_ip;
-       child_ep->com.remote_addr.sin_family = PF_INET;
+       child_ep->com.remote_addr.sin_family = AF_INET;
         child_ep->com.remote_addr.sin_port = req->peer_port;
         child_ep->com.remote_addr.sin_addr.s_addr = req->peer_ip;
         get_ep(&parent_ep->com);
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c

index bb1a839d4d6d43af7c36f881c2be14145fa5d9d7..3edb80644b53101d76d3de07cd3b3603fb49cea1 100644 (file)
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -1183,18 +1183,6 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
         return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type);
  }
  
-static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, char *buf)
-{
-       struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
-                                                ibdev.dev);
-       struct ethtool_drvinfo info;
-       struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
-
-       PDBG("%s dev 0x%p\n", __func__, dev);
-       lldev->ethtool_ops->get_drvinfo(lldev, &info);
-       return sprintf(buf, "%s\n", info.fw_version);
-}
-
  static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
                         char *buf)
  {
@@ -1334,13 +1322,11 @@ static int iwch_get_mib(struct ib_device *ibdev, struct rdma_hw_stats *stats,
  }
  
  static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
  static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
  static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
  
  static struct device_attribute *iwch_class_attributes[] = {
         &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
         &dev_attr_hca_type,
         &dev_attr_board_id,
  };
@@ -1362,6 +1348,18 @@ static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num,
         return 0;
  }
  
+static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str,
+                              size_t str_len)
+{
+       struct iwch_dev *iwch_dev = to_iwch_dev(ibdev);
+       struct ethtool_drvinfo info;
+       struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
+
+       PDBG("%s dev 0x%p\n", __func__, iwch_dev);
+       lldev->ethtool_ops->get_drvinfo(lldev, &info);
+       snprintf(str, str_len, "%s", info.fw_version);
+}
+
  int iwch_register_device(struct iwch_dev *dev)
  {
         int ret;
@@ -1437,6 +1435,7 @@ int iwch_register_device(struct iwch_dev *dev)
         dev->ibdev.get_hw_stats = iwch_get_mib;
         dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
         dev->ibdev.get_port_immutable = iwch_port_immutable;
+       dev->ibdev.get_dev_fw_str = get_dev_fw_ver_str;
  
         dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
         if (!dev->ibdev.iwcm)
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c

index a3a67216bce6748423ef0c4b33ed32740d54e29c..3aca7f6171b428eac865567978ff21d59ab39dbf 100644 (file)
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -294,6 +294,25 @@ static void state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new)
         return;
  }
  
+static int alloc_ep_skb_list(struct sk_buff_head *ep_skb_list, int size)
+{
+       struct sk_buff *skb;
+       unsigned int i;
+       size_t len;
+
+       len = roundup(sizeof(union cpl_wr_size), 16);
+       for (i = 0; i < size; i++) {
+               skb = alloc_skb(len, GFP_KERNEL);
+               if (!skb)
+                       goto fail;
+               skb_queue_tail(ep_skb_list, skb);
+       }
+       return 0;
+fail:
+       skb_queue_purge(ep_skb_list);
+       return -ENOMEM;
+}
+
  static void *alloc_ep(int size, gfp_t gfp)
  {
         struct c4iw_ep_common *epc;
@@ -384,6 +403,8 @@ void _c4iw_free_ep(struct kref *kref)
                 if (ep->mpa_skb)
                         kfree_skb(ep->mpa_skb);
         }
+       if (!skb_queue_empty(&ep->com.ep_skb_list))
+               skb_queue_purge(&ep->com.ep_skb_list);
         kfree(ep);
  }
  
@@ -620,25 +641,27 @@ static void abort_arp_failure(void *handle, struct sk_buff *skb)
         }
  }
  
-static int send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
+static int send_flowc(struct c4iw_ep *ep)
  {
-       unsigned int flowclen = 80;
         struct fw_flowc_wr *flowc;
+       struct sk_buff *skb = skb_dequeue(&ep->com.ep_skb_list);
         int i;
         u16 vlan = ep->l2t->vlan;
         int nparams;
  
+       if (WARN_ON(!skb))
+               return -ENOMEM;
+
         if (vlan == CPL_L2T_VLAN_NONE)
                 nparams = 8;
         else
                 nparams = 9;
  
-       skb = get_skb(skb, flowclen, GFP_KERNEL);
-       flowc = (struct fw_flowc_wr *)__skb_put(skb, flowclen);
+       flowc = (struct fw_flowc_wr *)__skb_put(skb, FLOWC_LEN);
  
         flowc->op_to_nparams = cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) |
                                            FW_FLOWC_WR_NPARAMS_V(nparams));
-       flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(flowclen,
+       flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(FLOWC_LEN,
                                           16)) | FW_WR_FLOWID_V(ep->hwtid));
  
         flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
@@ -679,18 +702,16 @@ static int send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
         return c4iw_ofld_send(&ep->com.dev->rdev, skb);
  }
  
-static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp)
+static int send_halfclose(struct c4iw_ep *ep)
  {
         struct cpl_close_con_req *req;
-       struct sk_buff *skb;
+       struct sk_buff *skb = skb_dequeue(&ep->com.ep_skb_list);
         int wrlen = roundup(sizeof *req, 16);
  
         PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
-       skb = get_skb(NULL, wrlen, gfp);
-       if (!skb) {
-               printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__);
+       if (WARN_ON(!skb))
                 return -ENOMEM;
-       }
+
         set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
         t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
         req = (struct cpl_close_con_req *) skb_put(skb, wrlen);
@@ -701,26 +722,24 @@ static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp)
         return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
  }
  
-static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
+static int send_abort(struct c4iw_ep *ep)
  {
         struct cpl_abort_req *req;
         int wrlen = roundup(sizeof *req, 16);
+       struct sk_buff *req_skb = skb_dequeue(&ep->com.ep_skb_list);
  
         PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
-       skb = get_skb(skb, wrlen, gfp);
-       if (!skb) {
-               printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
-                      __func__);
+       if (WARN_ON(!req_skb))
                 return -ENOMEM;
-       }
-       set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
-       t4_set_arp_err_handler(skb, ep, abort_arp_failure);
-       req = (struct cpl_abort_req *) skb_put(skb, wrlen);
+
+       set_wr_txq(req_skb, CPL_PRIORITY_DATA, ep->txq_idx);
+       t4_set_arp_err_handler(req_skb, ep, abort_arp_failure);
+       req = (struct cpl_abort_req *)skb_put(req_skb, wrlen);
         memset(req, 0, wrlen);
         INIT_TP_WR(req, ep->hwtid);
         OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid));
         req->cmd = CPL_ABORT_SEND_RST;
-       return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+       return c4iw_l2t_send(&ep->com.dev->rdev, req_skb, ep->l2t);
  }
  
  static void best_mtu(const unsigned short *mtus, unsigned short mtu,
@@ -992,9 +1011,19 @@ static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
  
         mpa = (struct mpa_message *)(req + 1);
         memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key));
-       mpa->flags = (crc_enabled ? MPA_CRC : 0) |
-                    (markers_enabled ? MPA_MARKERS : 0) |
-                    (mpa_rev_to_use == 2 ? MPA_ENHANCED_RDMA_CONN : 0);
+
+       mpa->flags = 0;
+       if (crc_enabled)
+               mpa->flags |= MPA_CRC;
+       if (markers_enabled) {
+               mpa->flags |= MPA_MARKERS;
+               ep->mpa_attr.recv_marker_enabled = 1;
+       } else {
+               ep->mpa_attr.recv_marker_enabled = 0;
+       }
+       if (mpa_rev_to_use == 2)
+               mpa->flags |= MPA_ENHANCED_RDMA_CONN;
+
         mpa->private_data_size = htons(ep->plen);
         mpa->revision = mpa_rev_to_use;
         if (mpa_rev_to_use == 1) {
@@ -1169,8 +1198,11 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
         mpa = (struct mpa_message *)(req + 1);
         memset(mpa, 0, sizeof(*mpa));
         memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
-       mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) |
-                    (markers_enabled ? MPA_MARKERS : 0);
+       mpa->flags = 0;
+       if (ep->mpa_attr.crc_enabled)
+               mpa->flags |= MPA_CRC;
+       if (ep->mpa_attr.recv_marker_enabled)
+               mpa->flags |= MPA_MARKERS;
         mpa->revision = ep->mpa_attr.version;
         mpa->private_data_size = htons(plen);
  
@@ -1248,7 +1280,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
         set_bit(ACT_ESTAB, &ep->com.history);
  
         /* start MPA negotiation */
-       ret = send_flowc(ep, NULL);
+       ret = send_flowc(ep);
         if (ret)
                 goto err;
         if (ep->retry_with_mpa_v1)
@@ -1555,7 +1587,6 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
          */
         __state_set(&ep->com, FPDU_MODE);
         ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
-       ep->mpa_attr.recv_marker_enabled = markers_enabled;
         ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
         ep->mpa_attr.version = mpa->revision;
         ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED;
@@ -2004,12 +2035,17 @@ static int send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
  }
  
  /*
- * Return whether a failed active open has allocated a TID
+ * Some of the error codes above implicitly indicate that there is no TID
+ * allocated with the result of an ACT_OPEN.  We use this predicate to make
+ * that explicit.
   */
  static inline int act_open_has_tid(int status)
  {
-       return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
-              status != CPL_ERR_ARP_MISS;
+       return (status != CPL_ERR_TCAM_PARITY &&
+               status != CPL_ERR_TCAM_MISS &&
+               status != CPL_ERR_TCAM_FULL &&
+               status != CPL_ERR_CONN_EXIST_SYNRECV &&
+               status != CPL_ERR_CONN_EXIST);
  }
  
  /* Returns whether a CPL status conveys negative advice.
@@ -2130,6 +2166,7 @@ out:
  static int c4iw_reconnect(struct c4iw_ep *ep)
  {
         int err = 0;
+       int size = 0;
         struct sockaddr_in *laddr = (struct sockaddr_in *)
                                     &ep->com.cm_id->m_local_addr;
         struct sockaddr_in *raddr = (struct sockaddr_in *)
@@ -2145,6 +2182,21 @@ static int c4iw_reconnect(struct c4iw_ep *ep)
         init_timer(&ep->timer);
         c4iw_init_wr_wait(&ep->com.wr_wait);
  
+       /* When the MPA revisions of the two nodes differ, the node with
+        * MPA_rev=2 retries the connection with MPA_rev=1 on the same EP
+        * through c4iw_reconnect(), and that EP is assigned a new tid for
+        * the new connection attempt. Because the same EP is reused, some
+        * of its preallocated skbs were already consumed by the previous
+        * c4iw_connect(), leaving too few for c4iw_reconnect(); peer_abort()
+        * would then trip its WARN_ON() on an empty ep_skb_list. Allocate
+        * replacements for the skbs that have already been used.
+        */
+       size = (CN_MAX_CON_BUF - skb_queue_len(&ep->com.ep_skb_list));
+       if (alloc_ep_skb_list(&ep->com.ep_skb_list, size)) {
+               err = -ENOMEM;
+               goto fail1;
+       }
+
         /*
          * Allocate an active TID to initiate a TCP connection.
          */
@@ -2210,6 +2262,7 @@ fail2:
          * response of 1st connect request.
          */
         connect_reply_upcall(ep, -ECONNRESET);
+fail1:
         c4iw_put_ep(&ep->com);
  out:
         return err;
@@ -2576,6 +2629,10 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
         if (peer_mss && child_ep->mtu > (peer_mss + hdrs))
                 child_ep->mtu = peer_mss + hdrs;
  
+       skb_queue_head_init(&child_ep->com.ep_skb_list);
+       if (alloc_ep_skb_list(&child_ep->com.ep_skb_list, CN_MAX_CON_BUF))
+               goto fail;
+
         state_set(&child_ep->com, CONNECTING);
         child_ep->com.dev = dev;
         child_ep->com.cm_id = NULL;
@@ -2640,6 +2697,8 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
                                (const u32 *)&sin6->sin6_addr.s6_addr, 1);
         }
         goto out;
+fail:
+       c4iw_put_ep(&child_ep->com);
  reject:
         reject_cr(dev, hwtid, skb);
         if (parent_ep)
@@ -2670,7 +2729,7 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb)
         ep->com.state = MPA_REQ_WAIT;
         start_ep_timer(ep);
         set_bit(PASS_ESTAB, &ep->com.history);
-       ret = send_flowc(ep, skb);
+       ret = send_flowc(ep);
         mutex_unlock(&ep->com.mutex);
         if (ret)
                 c4iw_ep_disconnect(ep, 1, GFP_KERNEL);
@@ -2871,10 +2930,8 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
         }
         mutex_unlock(&ep->com.mutex);
  
-       rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL);
-       if (!rpl_skb) {
-               printk(KERN_ERR MOD "%s - cannot allocate skb!\n",
-                      __func__);
+       rpl_skb = skb_dequeue(&ep->com.ep_skb_list);
+       if (WARN_ON(!rpl_skb)) {
                 release = 1;
                 goto out;
         }
@@ -3011,9 +3068,9 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb)
                 PDBG("%s last streaming msg ack ep %p tid %u state %u "
                      "initiator %u freeing skb\n", __func__, ep, ep->hwtid,
                      state_read(&ep->com), ep->mpa_attr.initiator ? 1 : 0);
+               mutex_lock(&ep->com.mutex);
                 kfree_skb(ep->mpa_skb);
                 ep->mpa_skb = NULL;
-               mutex_lock(&ep->com.mutex);
                 if (test_bit(STOP_MPA_TIMER, &ep->com.flags))
                         stop_ep_timer(ep);
                 mutex_unlock(&ep->com.mutex);
@@ -3025,9 +3082,9 @@ out:
  
  int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
  {
-       int err = 0;
-       int disconnect = 0;
+       int abort;
         struct c4iw_ep *ep = to_ep(cm_id);
+
         PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
  
         mutex_lock(&ep->com.mutex);
@@ -3038,16 +3095,13 @@ int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
         }
         set_bit(ULP_REJECT, &ep->com.history);
         if (mpa_rev == 0)
-               disconnect = 2;
-       else {
-               err = send_mpa_reject(ep, pdata, pdata_len);
-               disconnect = 1;
-       }
+               abort = 1;
+       else
+               abort = send_mpa_reject(ep, pdata, pdata_len);
         mutex_unlock(&ep->com.mutex);
-       if (disconnect) {
-               stop_ep_timer(ep);
-               err = c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL);
-       }
+
+       stop_ep_timer(ep);
+       c4iw_ep_disconnect(ep, abort != 0, GFP_KERNEL);
         c4iw_put_ep(&ep->com);
         return 0;
  }
@@ -3248,6 +3302,13 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                 err = -ENOMEM;
                 goto out;
         }
+
+       skb_queue_head_init(&ep->com.ep_skb_list);
+       if (alloc_ep_skb_list(&ep->com.ep_skb_list, CN_MAX_CON_BUF)) {
+               err = -ENOMEM;
+               goto fail1;
+       }
+
         init_timer(&ep->timer);
         ep->plen = conn_param->private_data_len;
         if (ep->plen)
@@ -3266,7 +3327,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
         if (!ep->com.qp) {
                 PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn);
                 err = -EINVAL;
-               goto fail1;
+               goto fail2;
         }
         ref_qp(ep);
         PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn,
@@ -3279,7 +3340,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
         if (ep->atid == -1) {
                 printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__);
                 err = -ENOMEM;
-               goto fail1;
+               goto fail2;
         }
         insert_handle(dev, &dev->atid_idr, ep, ep->atid);
  
@@ -3303,7 +3364,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                 if (raddr->sin_addr.s_addr == htonl(INADDR_ANY)) {
                         err = pick_local_ipaddrs(dev, cm_id);
                         if (err)
-                               goto fail1;
+                               goto fail2;
                 }
  
                 /* find a route */
@@ -3323,7 +3384,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                 if (ipv6_addr_type(&raddr6->sin6_addr) == IPV6_ADDR_ANY) {
                         err = pick_local_ip6addrs(dev, cm_id);
                         if (err)
-                               goto fail1;
+                               goto fail2;
                 }
  
                 /* find a route */
@@ -3339,14 +3400,14 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
         if (!ep->dst) {
                 printk(KERN_ERR MOD "%s - cannot find route.\n", __func__);
                 err = -EHOSTUNREACH;
-               goto fail2;
+               goto fail3;
         }
  
         err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true,
                         ep->com.dev->rdev.lldi.adapter_type, cm_id->tos);
         if (err) {
                 printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__);
-               goto fail3;
+               goto fail4;
         }
  
         PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n",
@@ -3362,13 +3423,15 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                 goto out;
  
         cxgb4_l2t_release(ep->l2t);
-fail3:
+fail4:
         dst_release(ep->dst);
-fail2:
+fail3:
         remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
         cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
-fail1:
+fail2:
+       skb_queue_purge(&ep->com.ep_skb_list);
         deref_cm_id(&ep->com);
+fail1:
         c4iw_put_ep(&ep->com);
  out:
         return err;
@@ -3461,6 +3524,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
                 err = -ENOMEM;
                 goto fail1;
         }
+       skb_queue_head_init(&ep->com.ep_skb_list);
         PDBG("%s ep %p\n", __func__, ep);
         ep->com.cm_id = cm_id;
         ref_cm_id(&ep->com);
@@ -3577,11 +3641,22 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
         case MPA_REQ_RCVD:
         case MPA_REP_SENT:
         case FPDU_MODE:
+       case CONNECTING:
                 close = 1;
                 if (abrupt)
                         ep->com.state = ABORTING;
                 else {
                         ep->com.state = CLOSING;
+
+                       /*
+                        * If we close before we see the fw4_ack(), fix up
+                        * the timer state, since the timer is being reused.
+                        */
+                       if (ep->mpa_skb &&
+                           test_bit(STOP_MPA_TIMER, &ep->com.flags)) {
+                               clear_bit(STOP_MPA_TIMER, &ep->com.flags);
+                               stop_ep_timer(ep);
+                       }
                         start_ep_timer(ep);
                 }
                 set_bit(CLOSE_SENT, &ep->com.flags);
@@ -3611,10 +3686,10 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
                 if (abrupt) {
                         set_bit(EP_DISC_ABORT, &ep->com.history);
                         close_complete_upcall(ep, -ECONNRESET);
-                       ret = send_abort(ep, NULL, gfp);
+                       ret = send_abort(ep);
                 } else {
                         set_bit(EP_DISC_CLOSE, &ep->com.history);
-                       ret = send_halfclose(ep, gfp);
+                       ret = send_halfclose(ep);
                 }
                 if (ret) {
                         set_bit(EP_DISC_FAIL, &ep->com.history);
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c

index b0b9557244582bf4c87c5a389c6c7c5bfb286717..812ab7278b8eec477183f451100b7c900adbeaec 100644 (file)
--- a/drivers/infiniband/hw/cxgb4/cq.c
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -33,19 +33,15 @@
  #include "iw_cxgb4.h"
  
  static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
-                     struct c4iw_dev_ucontext *uctx)
+                     struct c4iw_dev_ucontext *uctx, struct sk_buff *skb)
  {
         struct fw_ri_res_wr *res_wr;
         struct fw_ri_res *res;
         int wr_len;
         struct c4iw_wr_wait wr_wait;
-       struct sk_buff *skb;
         int ret;
  
         wr_len = sizeof *res_wr + sizeof *res;
-       skb = alloc_skb(wr_len, GFP_KERNEL);
-       if (!skb)
-               return -ENOMEM;
         set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
  
         res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len);
@@ -863,7 +859,9 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq)
         ucontext = ib_cq->uobject ? to_c4iw_ucontext(ib_cq->uobject->context)
                                   : NULL;
         destroy_cq(&chp->rhp->rdev, &chp->cq,
-                  ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx);
+                  ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx,
+                  chp->destroy_skb);
+       chp->destroy_skb = NULL;
         kfree(chp);
         return 0;
  }
@@ -879,7 +877,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
         struct c4iw_cq *chp;
         struct c4iw_create_cq_resp uresp;
         struct c4iw_ucontext *ucontext = NULL;
-       int ret;
+       int ret, wr_len;
         size_t memsize, hwentries;
         struct c4iw_mm_entry *mm, *mm2;
  
@@ -896,6 +894,13 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
         if (!chp)
                 return ERR_PTR(-ENOMEM);
  
+       wr_len = sizeof(struct fw_ri_res_wr) + sizeof(struct fw_ri_res);
+       chp->destroy_skb = alloc_skb(wr_len, GFP_KERNEL);
+       if (!chp->destroy_skb) {
+               ret = -ENOMEM;
+               goto err1;
+       }
+
         if (ib_context)
                 ucontext = to_c4iw_ucontext(ib_context);
  
@@ -936,7 +941,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
         ret = create_cq(&rhp->rdev, &chp->cq,
                         ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
         if (ret)
-               goto err1;
+               goto err2;
  
         chp->rhp = rhp;
         chp->cq.size--;                         /* status page */
@@ -947,15 +952,15 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
         init_waitqueue_head(&chp->wait);
         ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid);
         if (ret)
-               goto err2;
+               goto err3;
  
         if (ucontext) {
                 mm = kmalloc(sizeof *mm, GFP_KERNEL);
                 if (!mm)
-                       goto err3;
+                       goto err4;
                 mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
                 if (!mm2)
-                       goto err4;
+                       goto err5;
  
                 uresp.qid_mask = rhp->rdev.cqmask;
                 uresp.cqid = chp->cq.cqid;
@@ -970,7 +975,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
                 ret = ib_copy_to_udata(udata, &uresp,
                                        sizeof(uresp) - sizeof(uresp.reserved));
                 if (ret)
-                       goto err5;
+                       goto err6;
  
                 mm->key = uresp.key;
                 mm->addr = virt_to_phys(chp->cq.queue);
@@ -986,15 +991,18 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
              __func__, chp->cq.cqid, chp, chp->cq.size,
              chp->cq.memsize, (unsigned long long) chp->cq.dma_addr);
         return &chp->ibcq;
-err5:
+err6:
         kfree(mm2);
-err4:
+err5:
         kfree(mm);
-err3:
+err4:
         remove_handle(rhp, &rhp->cqidr, chp->cq.cqid);
-err2:
+err3:
         destroy_cq(&chp->rhp->rdev, &chp->cq,
-                  ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+                  ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
+                  chp->destroy_skb);
+err2:
+       kfree_skb(chp->destroy_skb);
  err1:
         kfree(chp);
         return ERR_PTR(ret);
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c

index ae2e8b23d2dde589059b2c0a9ebdd077e17a3668..071d7332ec061489080a9ef55c1a6cbd9622af4d 100644 (file)
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -317,7 +317,7 @@ static int qp_open(struct inode *inode, struct file *file)
         idr_for_each(&qpd->devp->qpidr, count_idrs, &count);
         spin_unlock_irq(&qpd->devp->lock);
  
-       qpd->bufsize = count * 128;
+       qpd->bufsize = count * 180;
         qpd->buf = vmalloc(qpd->bufsize);
         if (!qpd->buf) {
                 kfree(qpd);
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h

index f6f34a75af271f34a141d305021f2d5cf3318aca..aa47e0ae80bc4c42d7a0009714a26be8e0ff7bbb 100644 (file)
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -384,6 +384,7 @@ struct c4iw_mr {
         struct ib_mr ibmr;
         struct ib_umem *umem;
         struct c4iw_dev *rhp;
+       struct sk_buff *dereg_skb;
         u64 kva;
         struct tpt_attributes attr;
         u64 *mpl;
@@ -400,6 +401,7 @@ static inline struct c4iw_mr *to_c4iw_mr(struct ib_mr *ibmr)
  struct c4iw_mw {
         struct ib_mw ibmw;
         struct c4iw_dev *rhp;
+       struct sk_buff *dereg_skb;
         u64 kva;
         struct tpt_attributes attr;
  };
@@ -412,6 +414,7 @@ static inline struct c4iw_mw *to_c4iw_mw(struct ib_mw *ibmw)
  struct c4iw_cq {
         struct ib_cq ibcq;
         struct c4iw_dev *rhp;
+       struct sk_buff *destroy_skb;
         struct t4_cq cq;
         spinlock_t lock;
         spinlock_t comp_handler_lock;
@@ -472,7 +475,7 @@ struct c4iw_qp {
         struct t4_wq wq;
         spinlock_t lock;
         struct mutex mutex;
-       atomic_t refcnt;
+       struct kref kref;
         wait_queue_head_t wait;
         struct timer_list timer;
         int sq_sig_all;
@@ -789,10 +792,29 @@ enum c4iw_ep_history {
         CM_ID_DEREFED           = 28,
  };
  
+enum conn_pre_alloc_buffers {
+       CN_ABORT_REQ_BUF,
+       CN_ABORT_RPL_BUF,
+       CN_CLOSE_CON_REQ_BUF,
+       CN_DESTROY_BUF,
+       CN_FLOWC_BUF,
+       CN_MAX_CON_BUF
+};
+
+#define FLOWC_LEN 80
+union cpl_wr_size {
+       struct cpl_abort_req abrt_req;
+       struct cpl_abort_rpl abrt_rpl;
+       struct fw_ri_wr ri_req;
+       struct cpl_close_con_req close_req;
+       char flowc_buf[FLOWC_LEN];
+};
+
  struct c4iw_ep_common {
         struct iw_cm_id *cm_id;
         struct c4iw_qp *qp;
         struct c4iw_dev *dev;
+       struct sk_buff_head ep_skb_list;
         enum c4iw_ep_state state;
         struct kref kref;
         struct mutex mutex;
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c

index 55d0651ee4de58931dd76be90105ec6aadbf47ef..0b91b0f4df71cf5265f75b57b5c9b0a8ea1a41cc 100644 (file)
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -59,9 +59,9 @@ static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length)
  }
  
  static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
-                                      u32 len, dma_addr_t data, int wait)
+                                      u32 len, dma_addr_t data,
+                                      int wait, struct sk_buff *skb)
  {
-       struct sk_buff *skb;
         struct ulp_mem_io *req;
         struct ulptx_sgl *sgl;
         u8 wr_len;
@@ -74,9 +74,11 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
                 c4iw_init_wr_wait(&wr_wait);
         wr_len = roundup(sizeof(*req) + sizeof(*sgl), 16);
  
-       skb = alloc_skb(wr_len, GFP_KERNEL);
-       if (!skb)
-               return -ENOMEM;
+       if (!skb) {
+               skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+               if (!skb)
+                       return -ENOMEM;
+       }
         set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
  
         req = (struct ulp_mem_io *)__skb_put(skb, wr_len);
@@ -108,9 +110,8 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
  }
  
  static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
-                                 void *data)
+                                 void *data, struct sk_buff *skb)
  {
-       struct sk_buff *skb;
         struct ulp_mem_io *req;
         struct ulptx_idata *sc;
         u8 wr_len, *to_dp, *from_dp;
@@ -134,9 +135,11 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
                 wr_len = roundup(sizeof *req + sizeof *sc +
                                  roundup(copy_len, T4_ULPTX_MIN_IO), 16);
  
-               skb = alloc_skb(wr_len, GFP_KERNEL);
-               if (!skb)
-                       return -ENOMEM;
+               if (!skb) {
+                       skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+                       if (!skb)
+                               return -ENOMEM;
+               }
                 set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
  
                 req = (struct ulp_mem_io *)__skb_put(skb, wr_len);
@@ -173,6 +176,7 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
                         memset(to_dp + copy_len, 0, T4_ULPTX_MIN_IO -
                                (copy_len % T4_ULPTX_MIN_IO));
                 ret = c4iw_ofld_send(rdev, skb);
+               skb = NULL;
                 if (ret)
                         return ret;
                 len -= C4IW_MAX_INLINE_SIZE;
@@ -182,7 +186,8 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
         return ret;
  }
  
-static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data)
+static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len,
+                              void *data, struct sk_buff *skb)
  {
         u32 remain = len;
         u32 dmalen;
@@ -205,7 +210,7 @@ static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *
                         dmalen = T4_ULPTX_MAX_DMA;
                 remain -= dmalen;
                 ret = _c4iw_write_mem_dma_aligned(rdev, addr, dmalen, daddr,
-                                                !remain);
+                                                !remain, skb);
                 if (ret)
                         goto out;
                 addr += dmalen >> 5;
@@ -213,7 +218,7 @@ static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *
                 daddr += dmalen;
         }
         if (remain)
-               ret = _c4iw_write_mem_inline(rdev, addr, remain, data);
+               ret = _c4iw_write_mem_inline(rdev, addr, remain, data, skb);
  out:
         dma_unmap_single(&rdev->lldi.pdev->dev, save, len, DMA_TO_DEVICE);
         return ret;
@@ -224,23 +229,25 @@ out:
   * If data is NULL, clear len byte of memory to zero.
   */
  static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len,
-                            void *data)
+                            void *data, struct sk_buff *skb)
  {
         if (is_t5(rdev->lldi.adapter_type) && use_dsgl) {
                 if (len > inline_threshold) {
-                       if (_c4iw_write_mem_dma(rdev, addr, len, data)) {
+                       if (_c4iw_write_mem_dma(rdev, addr, len, data, skb)) {
                                 printk_ratelimited(KERN_WARNING
                                                    "%s: dma map"
                                                    " failure (non fatal)\n",
                                                    pci_name(rdev->lldi.pdev));
                                 return _c4iw_write_mem_inline(rdev, addr, len,
-                                                             data);
-                       } else
+                                                             data, skb);
+                       } else {
                                 return 0;
+                       }
                 } else
-                       return _c4iw_write_mem_inline(rdev, addr, len, data);
+                       return _c4iw_write_mem_inline(rdev, addr,
+                                                     len, data, skb);
         } else
-               return _c4iw_write_mem_inline(rdev, addr, len, data);
+               return _c4iw_write_mem_inline(rdev, addr, len, data, skb);
  }
  
  /*
@@ -253,7 +260,8 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
                            u32 *stag, u8 stag_state, u32 pdid,
                            enum fw_ri_stag_type type, enum fw_ri_mem_perms perm,
                            int bind_enabled, u32 zbva, u64 to,
-                          u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr)
+                          u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr,
+                          struct sk_buff *skb)
  {
         int err;
         struct fw_ri_tpte tpt;
@@ -307,7 +315,7 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
         }
         err = write_adapter_mem(rdev, stag_idx +
                                 (rdev->lldi.vr->stag.start >> 5),
-                               sizeof(tpt), &tpt);
+                               sizeof(tpt), &tpt, skb);
  
         if (reset_tpt_entry) {
                 c4iw_put_resource(&rdev->resource.tpt_table, stag_idx);
@@ -327,28 +335,29 @@ static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl,
              __func__, pbl_addr, rdev->lldi.vr->pbl.start,
              pbl_size);
  
-       err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl);
+       err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl, NULL);
         return err;
  }
  
  static int dereg_mem(struct c4iw_rdev *rdev, u32 stag, u32 pbl_size,
-                    u32 pbl_addr)
+                    u32 pbl_addr, struct sk_buff *skb)
  {
         return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0,
-                              pbl_size, pbl_addr);
+                              pbl_size, pbl_addr, skb);
  }
  
  static int allocate_window(struct c4iw_rdev *rdev, u32 * stag, u32 pdid)
  {
         *stag = T4_STAG_UNSET;
         return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_MW, 0, 0, 0,
-                              0UL, 0, 0, 0, 0);
+                              0UL, 0, 0, 0, 0, NULL);
  }
  
-static int deallocate_window(struct c4iw_rdev *rdev, u32 stag)
+static int deallocate_window(struct c4iw_rdev *rdev, u32 stag,
+                            struct sk_buff *skb)
  {
         return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, 0,
-                              0);
+                              0, skb);
  }
  
  static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid,
@@ -356,7 +365,7 @@ static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid,
  {
         *stag = T4_STAG_UNSET;
         return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_NSMR, 0, 0, 0,
-                              0UL, 0, 0, pbl_size, pbl_addr);
+                              0UL, 0, 0, pbl_size, pbl_addr, NULL);
  }
  
  static int finish_mem_reg(struct c4iw_mr *mhp, u32 stag)
@@ -383,14 +392,16 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
                               mhp->attr.mw_bind_enable, mhp->attr.zbva,
                               mhp->attr.va_fbo, mhp->attr.len ?
                               mhp->attr.len : -1, shift - 12,
-                             mhp->attr.pbl_size, mhp->attr.pbl_addr);
+                             mhp->attr.pbl_size, mhp->attr.pbl_addr, NULL);
         if (ret)
                 return ret;
  
         ret = finish_mem_reg(mhp, stag);
-       if (ret)
+       if (ret) {
                 dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
-                      mhp->attr.pbl_addr);
+                         mhp->attr.pbl_addr, mhp->dereg_skb);
+               mhp->dereg_skb = NULL;
+       }
         return ret;
  }
  
@@ -423,6 +434,12 @@ struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
         if (!mhp)
                 return ERR_PTR(-ENOMEM);
  
+       mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL);
+       if (!mhp->dereg_skb) {
+               ret = -ENOMEM;
+               goto err0;
+       }
+
         mhp->rhp = rhp;
         mhp->attr.pdid = php->pdid;
         mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
@@ -435,7 +452,8 @@ struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
  
         ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, php->pdid,
                               FW_RI_STAG_NSMR, mhp->attr.perms,
-                             mhp->attr.mw_bind_enable, 0, 0, ~0ULL, 0, 0, 0);
+                             mhp->attr.mw_bind_enable, 0, 0, ~0ULL, 0, 0, 0,
+                             NULL);
         if (ret)
                 goto err1;
  
@@ -445,8 +463,10 @@ struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
         return &mhp->ibmr;
  err2:
         dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
-                 mhp->attr.pbl_addr);
+                 mhp->attr.pbl_addr, mhp->dereg_skb);
  err1:
+       kfree_skb(mhp->dereg_skb);
+err0:
         kfree(mhp);
         return ERR_PTR(ret);
  }
@@ -481,11 +501,18 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
         if (!mhp)
                 return ERR_PTR(-ENOMEM);
  
+       mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL);
+       if (!mhp->dereg_skb) {
+               kfree(mhp);
+               return ERR_PTR(-ENOMEM);
+       }
+
         mhp->rhp = rhp;
  
         mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
         if (IS_ERR(mhp->umem)) {
                 err = PTR_ERR(mhp->umem);
+               kfree_skb(mhp->dereg_skb);
                 kfree(mhp);
                 return ERR_PTR(err);
         }
@@ -550,6 +577,7 @@ err_pbl:
  
  err:
         ib_umem_release(mhp->umem);
+       kfree_skb(mhp->dereg_skb);
         kfree(mhp);
         return ERR_PTR(err);
  }
@@ -572,11 +600,16 @@ struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
         mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
         if (!mhp)
                 return ERR_PTR(-ENOMEM);
-       ret = allocate_window(&rhp->rdev, &stag, php->pdid);
-       if (ret) {
-               kfree(mhp);
-               return ERR_PTR(ret);
+
+       mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL);
+       if (!mhp->dereg_skb) {
+               ret = -ENOMEM;
+               goto free_mhp;
         }
+
+       ret = allocate_window(&rhp->rdev, &stag, php->pdid);
+       if (ret)
+               goto free_skb;
         mhp->rhp = rhp;
         mhp->attr.pdid = php->pdid;
         mhp->attr.type = FW_RI_STAG_MW;
@@ -584,12 +617,19 @@ struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
         mmid = (stag) >> 8;
         mhp->ibmw.rkey = stag;
         if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
-               deallocate_window(&rhp->rdev, mhp->attr.stag);
-               kfree(mhp);
-               return ERR_PTR(-ENOMEM);
+               ret = -ENOMEM;
+               goto dealloc_win;
         }
         PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
         return &(mhp->ibmw);
+
+dealloc_win:
+       deallocate_window(&rhp->rdev, mhp->attr.stag, mhp->dereg_skb);
+free_skb:
+       kfree_skb(mhp->dereg_skb);
+free_mhp:
+       kfree(mhp);
+       return ERR_PTR(ret);
  }
  
  int c4iw_dealloc_mw(struct ib_mw *mw)
@@ -602,7 +642,8 @@ int c4iw_dealloc_mw(struct ib_mw *mw)
         rhp = mhp->rhp;
         mmid = (mw->rkey) >> 8;
         remove_handle(rhp, &rhp->mmidr, mmid);
-       deallocate_window(&rhp->rdev, mhp->attr.stag);
+       deallocate_window(&rhp->rdev, mhp->attr.stag, mhp->dereg_skb);
+       kfree_skb(mhp->dereg_skb);
         kfree(mhp);
         PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp);
         return 0;
@@ -666,7 +707,7 @@ struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
         return &(mhp->ibmr);
  err3:
         dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size,
-                      mhp->attr.pbl_addr);
+                 mhp->attr.pbl_addr, mhp->dereg_skb);
  err2:
         c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
                               mhp->attr.pbl_size << 3);
@@ -717,7 +758,7 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr)
                 dma_free_coherent(&mhp->rhp->rdev.lldi.pdev->dev,
                                   mhp->max_mpl_len, mhp->mpl, mhp->mpl_addr);
         dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
-                      mhp->attr.pbl_addr);
+                 mhp->attr.pbl_addr, mhp->dereg_skb);
         if (mhp->attr.pbl_size)
                 c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
                                   mhp->attr.pbl_size << 3);
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c

index dd8a86b726d2a70cf5bbee75e89db437fa33eca7..df127ce6b6ec31772f4b6ce1f4a3e980b2800bf3 100644 (file)
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -409,20 +409,6 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
                        CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type));
  }
  
-static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
-                          char *buf)
-{
-       struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
-                                                ibdev.dev);
-       PDBG("%s dev 0x%p\n", __func__, dev);
-
-       return sprintf(buf, "%u.%u.%u.%u\n",
-                       FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers),
-                       FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers),
-                       FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers),
-                       FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers));
-}
-
  static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
                         char *buf)
  {
@@ -502,13 +488,11 @@ static int c4iw_get_mib(struct ib_device *ibdev,
  }
  
  static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
  static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
  static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
  
  static struct device_attribute *c4iw_class_attributes[] = {
         &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
         &dev_attr_hca_type,
         &dev_attr_board_id,
  };
@@ -530,6 +514,20 @@ static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num,
         return 0;
  }
  
+static void get_dev_fw_str(struct ib_device *dev, char *str,
+                          size_t str_len)
+{
+       struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+                                                ibdev);
+       PDBG("%s dev 0x%p\n", __func__, dev);
+
+       snprintf(str, str_len, "%u.%u.%u.%u",
+                FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers),
+                FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers),
+                FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers),
+                FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers));
+}
+
  int c4iw_register_device(struct c4iw_dev *dev)
  {
         int ret;
@@ -605,6 +603,7 @@ int c4iw_register_device(struct c4iw_dev *dev)
         dev->ibdev.get_hw_stats = c4iw_get_mib;
         dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
         dev->ibdev.get_port_immutable = c4iw_port_immutable;
+       dev->ibdev.get_dev_fw_str = get_dev_fw_str;
         dev->ibdev.drain_sq = c4iw_drain_sq;
         dev->ibdev.drain_rq = c4iw_drain_rq;
  
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c

index e8993e49b8b3a6d5a99634c4f7376a761762bda0..edb1172b6f54c14b991c1c3f76b36b612c841ca2 100644 (file)
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -683,17 +683,25 @@ static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr,
         return 0;
  }
  
+void _free_qp(struct kref *kref)
+{
+       struct c4iw_qp *qhp;
+
+       qhp = container_of(kref, struct c4iw_qp, kref);
+       PDBG("%s qhp %p\n", __func__, qhp);
+       kfree(qhp);
+}
+
  void c4iw_qp_add_ref(struct ib_qp *qp)
  {
         PDBG("%s ib_qp %p\n", __func__, qp);
-       atomic_inc(&(to_c4iw_qp(qp)->refcnt));
+       kref_get(&to_c4iw_qp(qp)->kref);
  }
  
  void c4iw_qp_rem_ref(struct ib_qp *qp)
  {
         PDBG("%s ib_qp %p\n", __func__, qp);
-       if (atomic_dec_and_test(&(to_c4iw_qp(qp)->refcnt)))
-               wake_up(&(to_c4iw_qp(qp)->wait));
+       kref_put(&to_c4iw_qp(qp)->kref, _free_qp);
  }
  
  static void add_to_fc_list(struct list_head *head, struct list_head *entry)
@@ -1081,9 +1089,10 @@ static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe,
         PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
              qhp->ep->hwtid);
  
-       skb = alloc_skb(sizeof *wqe, gfp);
-       if (!skb)
+       skb = skb_dequeue(&qhp->ep->com.ep_skb_list);
+       if (WARN_ON(!skb))
                 return;
+
         set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx);
  
         wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
@@ -1202,9 +1211,10 @@ static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
         PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
              ep->hwtid);
  
-       skb = alloc_skb(sizeof *wqe, GFP_KERNEL);
-       if (!skb)
+       skb = skb_dequeue(&ep->com.ep_skb_list);
+       if (WARN_ON(!skb))
                 return -ENOMEM;
+
         set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
  
         wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
@@ -1592,8 +1602,6 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp)
         wait_event(qhp->wait, !qhp->ep);
  
         remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
-       atomic_dec(&qhp->refcnt);
-       wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
  
         spin_lock_irq(&rhp->lock);
         if (!list_empty(&qhp->db_fc_entry))
@@ -1606,8 +1614,9 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp)
         destroy_qp(&rhp->rdev, &qhp->wq,
                    ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
  
+       c4iw_qp_rem_ref(ib_qp);
+
         PDBG("%s ib_qp %p qpid 0x%0x\n", __func__, ib_qp, qhp->wq.sq.qid);
-       kfree(qhp);
         return 0;
  }
  
@@ -1704,7 +1713,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
         init_completion(&qhp->rq_drained);
         mutex_init(&qhp->mutex);
         init_waitqueue_head(&qhp->wait);
-       atomic_set(&qhp->refcnt, 1);
+       kref_init(&qhp->kref);
  
         ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
         if (ret)
@@ -1896,12 +1905,20 @@ int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
         return 0;
  }
  
+static void move_qp_to_err(struct c4iw_qp *qp)
+{
+       struct c4iw_qp_attributes attrs = { .next_state = C4IW_QP_STATE_ERROR };
+
+       (void)c4iw_modify_qp(qp->rhp, qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+}
+
  void c4iw_drain_sq(struct ib_qp *ibqp)
  {
         struct c4iw_qp *qp = to_c4iw_qp(ibqp);
         unsigned long flag;
         bool need_to_wait;
  
+       move_qp_to_err(qp);
         spin_lock_irqsave(&qp->lock, flag);
         need_to_wait = !t4_sq_empty(&qp->wq);
         spin_unlock_irqrestore(&qp->lock, flag);
@@ -1916,6 +1933,7 @@ void c4iw_drain_rq(struct ib_qp *ibqp)
         unsigned long flag;
         bool need_to_wait;
  
+       move_qp_to_err(qp);
         spin_lock_irqsave(&qp->lock, flag);
         need_to_wait = !t4_rq_empty(&qp->wq);
         spin_unlock_irqrestore(&qp->lock, flag);
diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig

index a925fb0db70603f3ec64b99c92bbdcf365463fc3..f6ea0881765a1e789d2107479e7245d4cc0640c5 100644 (file)
--- a/drivers/infiniband/hw/hfi1/Kconfig
+++ b/drivers/infiniband/hw/hfi1/Kconfig
@@ -1,9 +1,9 @@
  config INFINIBAND_HFI1
         tristate "Intel OPA Gen1 support"
-       depends on X86_64 && INFINIBAND_RDMAVT
+       depends on X86_64 && INFINIBAND_RDMAVT && I2C
         select MMU_NOTIFIER
         select CRC32
-       default m
+       select I2C_ALGOBIT
         ---help---
         This is a low-level driver for Intel OPA Gen1 adapter.
  config HFI1_DEBUG_SDMA_ORDER
diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile

index 9b5382c94b0c86d1a2fa96f8c60e9069bd832b0c..0cf97a09b64b6e780dece48ffda7a02a1126753d 100644 (file)
--- a/drivers/infiniband/hw/hfi1/Makefile
+++ b/drivers/infiniband/hw/hfi1/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
  hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
         eprom.o file_ops.o firmware.o \
         init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
-       qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \
+       qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \
         uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
         verbs_txreq.o
  hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c

index 14d7eeb09be6545f5f21144f0f11ea41c53203a4..79575ee873f21af3361264b711419c267b6a23d0 100644 (file)
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -47,12 +47,18 @@
  #include <linux/topology.h>
  #include <linux/cpumask.h>
  #include <linux/module.h>
+#include <linux/cpumask.h>
  
  #include "hfi.h"
  #include "affinity.h"
  #include "sdma.h"
  #include "trace.h"
  
+struct hfi1_affinity_node_list node_affinity = {
+       .list = LIST_HEAD_INIT(node_affinity.list),
+       .lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock),
+};
+
  /* Name of IRQ types, indexed by enum irq_type */
  static const char * const irq_type_names[] = {
         "SDMA",
@@ -61,6 +67,9 @@ static const char * const irq_type_names[] = {
         "OTHER",
  };
  
+/* Per NUMA node count of HFI devices */
+static unsigned int *hfi1_per_node_cntr;
+
  static inline void init_cpu_mask_set(struct cpu_mask_set *set)
  {
         cpumask_clear(&set->mask);
@@ -69,47 +78,136 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set)
  }
  
  /* Initialize non-HT cpu cores mask */
-int init_real_cpu_mask(struct hfi1_devdata *dd)
+void init_real_cpu_mask(void)
  {
-       struct hfi1_affinity *info;
         int possible, curr_cpu, i, ht;
  
-       info = kzalloc(sizeof(*info), GFP_KERNEL);
-       if (!info)
-               return -ENOMEM;
-
-       cpumask_clear(&info->real_cpu_mask);
+       cpumask_clear(&node_affinity.real_cpu_mask);
  
         /* Start with cpu online mask as the real cpu mask */
-       cpumask_copy(&info->real_cpu_mask, cpu_online_mask);
+       cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);
  
         /*
          * Remove HT cores from the real cpu mask.  Do this in two steps below.
          */
-       possible = cpumask_weight(&info->real_cpu_mask);
+       possible = cpumask_weight(&node_affinity.real_cpu_mask);
         ht = cpumask_weight(topology_sibling_cpumask(
-                                       cpumask_first(&info->real_cpu_mask)));
+                               cpumask_first(&node_affinity.real_cpu_mask)));
         /*
          * Step 1.  Skip over the first N HT siblings and use them as the
          * "real" cores.  Assumes that HT cores are not enumerated in
          * succession (except in the single core case).
          */
-       curr_cpu = cpumask_first(&info->real_cpu_mask);
+       curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
         for (i = 0; i < possible / ht; i++)
-               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+               curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
         /*
          * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
          * skip any gaps.
          */
         for (; i < possible; i++) {
-               cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask);
-               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+               cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
+               curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
+       }
+}
+
+int node_affinity_init(void)
+{
+       int node;
+       struct pci_dev *dev = NULL;
+       const struct pci_device_id *ids = hfi1_pci_tbl;
+
+       cpumask_clear(&node_affinity.proc.used);
+       cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);
+
+       node_affinity.proc.gen = 0;
+       node_affinity.num_core_siblings =
+                               cpumask_weight(topology_sibling_cpumask(
+                                       cpumask_first(&node_affinity.proc.mask)
+                                       ));
+       node_affinity.num_online_nodes = num_online_nodes();
+       node_affinity.num_online_cpus = num_online_cpus();
+
+       /*
+        * The real cpu mask is part of the affinity struct but it has to be
+        * initialized early. It is needed to calculate the number of user
+        * contexts in set_up_context_variables().
+        */
+       init_real_cpu_mask();
+
+       hfi1_per_node_cntr = kcalloc(num_possible_nodes(),
+                                    sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
+       if (!hfi1_per_node_cntr)
+               return -ENOMEM;
+
+       while (ids->vendor) {
+               dev = NULL;
+               while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
+                       node = pcibus_to_node(dev->bus);
+                       if (node < 0)
+                               node = numa_node_id();
+
+                       hfi1_per_node_cntr[node]++;
+               }
+               ids++;
         }
  
-       dd->affinity = info;
         return 0;
  }
  
+void node_affinity_destroy(void)
+{
+       struct list_head *pos, *q;
+       struct hfi1_affinity_node *entry;
+
+       spin_lock(&node_affinity.lock);
+       list_for_each_safe(pos, q, &node_affinity.list) {
+               entry = list_entry(pos, struct hfi1_affinity_node,
+                                  list);
+               list_del(pos);
+               kfree(entry);
+       }
+       spin_unlock(&node_affinity.lock);
+       kfree(hfi1_per_node_cntr);
+}
+
+static struct hfi1_affinity_node *node_affinity_allocate(int node)
+{
+       struct hfi1_affinity_node *entry;
+
+       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+       if (!entry)
+               return NULL;
+       entry->node = node;
+       INIT_LIST_HEAD(&entry->list);
+
+       return entry;
+}
+
+/*
+ * It appends an entry to the list.
+ * It *must* be called with node_affinity.lock held.
+ */
+static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
+{
+       list_add_tail(&entry->list, &node_affinity.list);
+}
+
+/* It must be called with node_affinity.lock held */
+static struct hfi1_affinity_node *node_affinity_lookup(int node)
+{
+       struct list_head *pos;
+       struct hfi1_affinity_node *entry;
+
+       list_for_each(pos, &node_affinity.list) {
+               entry = list_entry(pos, struct hfi1_affinity_node, list);
+               if (entry->node == node)
+                       return entry;
+       }
+
+       return NULL;
+}
+
  /*
   * Interrupt affinity.
   *
@@ -121,10 +219,10 @@ int init_real_cpu_mask(struct hfi1_devdata *dd)
   * to the node relative 1 as necessary.
   *
   */
-void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
+int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
  {
         int node = pcibus_to_node(dd->pcidev->bus);
-       struct hfi1_affinity *info = dd->affinity;
+       struct hfi1_affinity_node *entry;
         const struct cpumask *local_mask;
         int curr_cpu, possible, i;
  
@@ -132,56 +230,93 @@ void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
                 node = numa_node_id();
         dd->node = node;
  
-       spin_lock_init(&info->lock);
-
-       init_cpu_mask_set(&info->def_intr);
-       init_cpu_mask_set(&info->rcv_intr);
-       init_cpu_mask_set(&info->proc);
-
         local_mask = cpumask_of_node(dd->node);
         if (cpumask_first(local_mask) >= nr_cpu_ids)
                 local_mask = topology_core_cpumask(0);
-       /* Use the "real" cpu mask of this node as the default */
-       cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask);
-
-       /*  fill in the receive list */
-       possible = cpumask_weight(&info->def_intr.mask);
-       curr_cpu = cpumask_first(&info->def_intr.mask);
-       if (possible == 1) {
-               /*  only one CPU, everyone will use it */
-               cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-       } else {
-               /*
-                * Retain the first CPU in the default list for the control
-                * context.
-                */
-               curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-               /*
-                * Remove the remaining kernel receive queues from
-                * the default list and add them to the receive list.
-                */
-               for (i = 0; i < dd->n_krcv_queues - 1; i++) {
-                       cpumask_clear_cpu(curr_cpu, &info->def_intr.mask);
-                       cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-                       curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-                       if (curr_cpu >= nr_cpu_ids)
-                               break;
+
+       spin_lock(&node_affinity.lock);
+       entry = node_affinity_lookup(dd->node);
+       spin_unlock(&node_affinity.lock);
+
+       /*
+        * If this is the first time this NUMA node's affinity is used,
+        * create an entry in the global affinity structure and initialize it.
+        */
+       if (!entry) {
+               entry = node_affinity_allocate(node);
+               if (!entry) {
+                       dd_dev_err(dd,
+                                  "Unable to allocate global affinity node\n");
+                       return -ENOMEM;
                 }
-       }
+               init_cpu_mask_set(&entry->def_intr);
+               init_cpu_mask_set(&entry->rcv_intr);
+               cpumask_clear(&entry->general_intr_mask);
+               /* Use the "real" cpu mask of this node as the default */
+               cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
+                           local_mask);
+
+               /* fill in the receive list */
+               possible = cpumask_weight(&entry->def_intr.mask);
+               curr_cpu = cpumask_first(&entry->def_intr.mask);
+
+               if (possible == 1) {
+                       /* only one CPU, everyone will use it */
+                       cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
+                       cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
+               } else {
+                       /*
+                        * The general/control context will be the first CPU in
+                        * the default list, so it is removed from the default
+                        * list and added to the general interrupt list.
+                        */
+                       cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
+                       cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
+                       curr_cpu = cpumask_next(curr_cpu,
+                                               &entry->def_intr.mask);
  
-       cpumask_copy(&info->proc.mask, cpu_online_mask);
-}
+                       /*
+                        * Remove the remaining kernel receive queues from
+                        * the default list and add them to the receive list.
+                        */
+                       for (i = 0;
+                            i < (dd->n_krcv_queues - 1) *
+                                 hfi1_per_node_cntr[dd->node];
+                            i++) {
+                               cpumask_clear_cpu(curr_cpu,
+                                                 &entry->def_intr.mask);
+                               cpumask_set_cpu(curr_cpu,
+                                               &entry->rcv_intr.mask);
+                               curr_cpu = cpumask_next(curr_cpu,
+                                                       &entry->def_intr.mask);
+                               if (curr_cpu >= nr_cpu_ids)
+                                       break;
+                       }
  
-void hfi1_dev_affinity_free(struct hfi1_devdata *dd)
-{
-       kfree(dd->affinity);
+                       /*
+                        * If no CPU cores are left over for the SDMA engines,
+                        * make them share the CPU of the general/control
+                        * context.
+                        */
+                       if (cpumask_weight(&entry->def_intr.mask) == 0)
+                               cpumask_copy(&entry->def_intr.mask,
+                                            &entry->general_intr_mask);
+               }
+
+               spin_lock(&node_affinity.lock);
+               node_affinity_add_tail(entry);
+               spin_unlock(&node_affinity.lock);
+       }
+
+       return 0;
  }
  
  int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
  {
         int ret;
         cpumask_var_t diff;
-       struct cpu_mask_set *set;
+       struct hfi1_affinity_node *entry;
+       struct cpu_mask_set *set = NULL;
         struct sdma_engine *sde = NULL;
         struct hfi1_ctxtdata *rcd = NULL;
         char extra[64];
@@ -194,22 +329,25 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
         if (!ret)
                 return -ENOMEM;
  
+       spin_lock(&node_affinity.lock);
+       entry = node_affinity_lookup(dd->node);
+       spin_unlock(&node_affinity.lock);
+
         switch (msix->type) {
         case IRQ_SDMA:
                 sde = (struct sdma_engine *)msix->arg;
                 scnprintf(extra, 64, "engine %u", sde->this_idx);
-               /* fall through */
+               set = &entry->def_intr;
+               break;
         case IRQ_GENERAL:
-               set = &dd->affinity->def_intr;
+               cpu = cpumask_first(&entry->general_intr_mask);
                 break;
         case IRQ_RCVCTXT:
                 rcd = (struct hfi1_ctxtdata *)msix->arg;
-               if (rcd->ctxt == HFI1_CTRL_CTXT) {
-                       set = &dd->affinity->def_intr;
-                       cpu = cpumask_first(&set->mask);
-               } else {
-                       set = &dd->affinity->rcv_intr;
-               }
+               if (rcd->ctxt == HFI1_CTRL_CTXT)
+                       cpu = cpumask_first(&entry->general_intr_mask);
+               else
+                       set = &entry->rcv_intr;
                 scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
                 break;
         default:
@@ -218,12 +356,12 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
         }
  
         /*
-        * The control receive context is placed on a particular CPU, which
-        * is set above.  Skip accounting for it.  Everything else finds its
-        * CPU here.
+        * The general and control contexts are placed on a particular
+        * CPU, which is set above. Skip accounting for it. Everything else
+        * finds its CPU here.
          */
-       if (cpu == -1) {
-               spin_lock(&dd->affinity->lock);
+       if (cpu == -1 && set) {
+               spin_lock(&node_affinity.lock);
                 if (cpumask_equal(&set->mask, &set->used)) {
                         /*
                          * We've used up all the CPUs, bump up the generation
@@ -235,7 +373,7 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
                 cpumask_andnot(diff, &set->mask, &set->used);
                 cpu = cpumask_first(diff);
                 cpumask_set_cpu(cpu, &set->used);
-               spin_unlock(&dd->affinity->lock);
+               spin_unlock(&node_affinity.lock);
         }
  
         switch (msix->type) {
@@ -263,43 +401,84 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
  {
         struct cpu_mask_set *set = NULL;
         struct hfi1_ctxtdata *rcd;
+       struct hfi1_affinity_node *entry;
+
+       spin_lock(&node_affinity.lock);
+       entry = node_affinity_lookup(dd->node);
+       spin_unlock(&node_affinity.lock);
  
         switch (msix->type) {
         case IRQ_SDMA:
+               set = &entry->def_intr;
+               break;
         case IRQ_GENERAL:
-               set = &dd->affinity->def_intr;
+               /* Don't do accounting for general contexts */
                 break;
         case IRQ_RCVCTXT:
                 rcd = (struct hfi1_ctxtdata *)msix->arg;
-               /* only do accounting for non control contexts */
+               /* Don't do accounting for control contexts */
                 if (rcd->ctxt != HFI1_CTRL_CTXT)
-                       set = &dd->affinity->rcv_intr;
+                       set = &entry->rcv_intr;
                 break;
         default:
                 return;
         }
  
         if (set) {
-               spin_lock(&dd->affinity->lock);
+               spin_lock(&node_affinity.lock);
                 cpumask_andnot(&set->used, &set->used, &msix->mask);
                 if (cpumask_empty(&set->used) && set->gen) {
                         set->gen--;
                         cpumask_copy(&set->used, &set->mask);
                 }
-               spin_unlock(&dd->affinity->lock);
+               spin_unlock(&node_affinity.lock);
         }
  
         irq_set_affinity_hint(msix->msix.vector, NULL);
         cpumask_clear(&msix->mask);
  }
  
-int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
+/* This should be called with node_affinity.lock held */
+static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
+                               struct hfi1_affinity_node_list *affinity)
  {
-       int cpu = -1, ret;
-       cpumask_var_t diff, mask, intrs;
+       int possible, curr_cpu, i;
+       uint num_cores_per_socket = node_affinity.num_online_cpus /
+                                       affinity->num_core_siblings /
+                                               node_affinity.num_online_nodes;
+
+       cpumask_copy(hw_thread_mask, &affinity->proc.mask);
+       if (affinity->num_core_siblings > 0) {
+               /* Removing other siblings not needed for now */
+               possible = cpumask_weight(hw_thread_mask);
+               curr_cpu = cpumask_first(hw_thread_mask);
+               for (i = 0;
+                    i < num_cores_per_socket * node_affinity.num_online_nodes;
+                    i++)
+                       curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
+
+               for (; i < possible; i++) {
+                       cpumask_clear_cpu(curr_cpu, hw_thread_mask);
+                       curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
+               }
+
+               /* Identifying correct HW threads within physical cores */
+               cpumask_shift_left(hw_thread_mask, hw_thread_mask,
+                                  num_cores_per_socket *
+                                  node_affinity.num_online_nodes *
+                                  hw_thread_no);
+       }
+}
+
+int hfi1_get_proc_affinity(int node)
+{
+       int cpu = -1, ret, i;
+       struct hfi1_affinity_node *entry;
+       cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
         const struct cpumask *node_mask,
                 *proc_mask = tsk_cpus_allowed(current);
-       struct cpu_mask_set *set = &dd->affinity->proc;
+       struct hfi1_affinity_node_list *affinity = &node_affinity;
+       struct cpu_mask_set *set = &affinity->proc;
  
         /*
          * check whether process/context affinity has already
@@ -325,22 +504,41 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
  
         /*
          * The process does not have a preset CPU affinity so find one to
-        * recommend. We prefer CPUs on the same NUMA as the device.
+        * recommend using the following algorithm:
+        *
+        * For each user process that is opening a context on HFI Y:
+        *  a) If all cores are filled, reinitialize the bitmask
+        *  b) Fill real cores first, then HT cores (first set of HT
+        *     cores on all physical cores, then second set of HT cores,
+        *     and so on) in the following order:
+        *
+        *     1. Same NUMA node as HFI Y and not running an IRQ
+        *        handler
+        *     2. Same NUMA node as HFI Y and running an IRQ handler
+        *     3. Different NUMA node to HFI Y and not running an IRQ
+        *        handler
+        *     4. Different NUMA node to HFI Y and running an IRQ
+        *        handler
+        *  c) Mark core as filled in the bitmask. As user processes are
+        *     done, clear cores from the bitmask.
          */
  
         ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
         if (!ret)
                 goto done;
-       ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
+       ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
         if (!ret)
                 goto free_diff;
-       ret = zalloc_cpumask_var(&intrs, GFP_KERNEL);
+       ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
+       if (!ret)
+               goto free_hw_thread_mask;
+       ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
         if (!ret)
-               goto free_mask;
+               goto free_available_mask;
  
-       spin_lock(&dd->affinity->lock);
+       spin_lock(&affinity->lock);
         /*
-        * If we've used all available CPUs, clear the mask and start
+        * If we've used all available HW threads, clear the mask and start
          * overloading.
          */
         if (cpumask_equal(&set->mask, &set->used)) {
@@ -348,81 +546,198 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
                 cpumask_clear(&set->used);
         }
  
-       /* CPUs used by interrupt handlers */
-       cpumask_copy(intrs, (dd->affinity->def_intr.gen ?
-                            &dd->affinity->def_intr.mask :
-                            &dd->affinity->def_intr.used));
-       cpumask_or(intrs, intrs, (dd->affinity->rcv_intr.gen ?
-                                 &dd->affinity->rcv_intr.mask :
-                                 &dd->affinity->rcv_intr.used));
+       /*
+        * If NUMA node has CPUs used by interrupt handlers, include them in the
+        * interrupt handler mask.
+        */
+       entry = node_affinity_lookup(node);
+       if (entry) {
+               cpumask_copy(intrs_mask, (entry->def_intr.gen ?
+                                         &entry->def_intr.mask :
+                                         &entry->def_intr.used));
+               cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
+                                                   &entry->rcv_intr.mask :
+                                                   &entry->rcv_intr.used));
+               cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
+       }
         hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
-                 cpumask_pr_args(intrs));
+                 cpumask_pr_args(intrs_mask));
+
+       cpumask_copy(hw_thread_mask, &set->mask);
  
         /*
-        * If we don't have a NUMA node requested, preference is towards
-        * device NUMA node
+        * If HT cores are enabled, identify which HW threads within the
+        * physical cores should be used.
          */
-       if (node == -1)
-               node = dd->node;
+       if (affinity->num_core_siblings > 0) {
+               for (i = 0; i < affinity->num_core_siblings; i++) {
+                       find_hw_thread_mask(i, hw_thread_mask, affinity);
+
+                       /*
+                        * If there's at least one available core for this HW
+                        * thread number, stop looking for a core.
+                        *
+                        * diff will be non-empty for at least one iteration
+                        * of this loop, as the used mask gets reset when
+                        * (set->mask == set->used) before this loop.
+                        */
+                       cpumask_andnot(diff, hw_thread_mask, &set->used);
+                       if (!cpumask_empty(diff))
+                               break;
+               }
+       }
+       hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
+                 cpumask_pr_args(hw_thread_mask));
+
         node_mask = cpumask_of_node(node);
-       hfi1_cdbg(PROC, "device on NUMA %u, CPUs %*pbl", node,
+       hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
                   cpumask_pr_args(node_mask));
  
-       /* diff will hold all unused cpus */
-       cpumask_andnot(diff, &set->mask, &set->used);
-       hfi1_cdbg(PROC, "unused CPUs (all) %*pbl", cpumask_pr_args(diff));
-
-       /* get cpumask of available CPUs on preferred NUMA */
-       cpumask_and(mask, diff, node_mask);
-       hfi1_cdbg(PROC, "available cpus on NUMA %*pbl", cpumask_pr_args(mask));
+       /* Get cpumask of available CPUs on preferred NUMA */
+       cpumask_and(available_mask, hw_thread_mask, node_mask);
+       cpumask_andnot(available_mask, available_mask, &set->used);
+       hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
+                 cpumask_pr_args(available_mask));
  
         /*
          * At first, we don't want to place processes on the same
-        * CPUs as interrupt handlers.
+        * CPUs as interrupt handlers. Then, CPUs running interrupt
+        * handlers are used.
+        *
+        * 1) If diff is not empty, then there are available CPUs
+        *    not running interrupt handlers, so diff gets copied
+        *    over to available_mask.
+        * 2) If diff is empty, then all CPUs not running interrupt
+        *    handlers are taken, so available_mask contains all
+        *    available CPUs running interrupt handlers.
+        * 3) If available_mask is empty, then all CPUs on the
+        *    preferred NUMA node are taken, so other NUMA nodes are
+        *    used for process assignments using the same method as
+        *    the preferred NUMA node.
          */
-       cpumask_andnot(diff, mask, intrs);
+       cpumask_andnot(diff, available_mask, intrs_mask);
         if (!cpumask_empty(diff))
-               cpumask_copy(mask, diff);
+               cpumask_copy(available_mask, diff);
  
-       /*
-        * if we don't have a cpu on the preferred NUMA, get
-        * the list of the remaining available CPUs
-        */
-       if (cpumask_empty(mask)) {
-               cpumask_andnot(diff, &set->mask, &set->used);
-               cpumask_andnot(mask, diff, node_mask);
+       /* If we don't have CPUs on the preferred node, use other NUMA nodes */
+       if (cpumask_empty(available_mask)) {
+               cpumask_andnot(available_mask, hw_thread_mask, &set->used);
+               /* Excluding preferred NUMA cores */
+               cpumask_andnot(available_mask, available_mask, node_mask);
+               hfi1_cdbg(PROC,
+                         "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
+                         cpumask_pr_args(available_mask));
+
+               /*
+                * At first, we don't want to place processes on the same
+                * CPUs as interrupt handlers.
+                */
+               cpumask_andnot(diff, available_mask, intrs_mask);
+               if (!cpumask_empty(diff))
+                       cpumask_copy(available_mask, diff);
         }
-       hfi1_cdbg(PROC, "possible CPUs for process %*pbl",
-                 cpumask_pr_args(mask));
+       hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
+                 cpumask_pr_args(available_mask));
  
-       cpu = cpumask_first(mask);
+       cpu = cpumask_first(available_mask);
         if (cpu >= nr_cpu_ids) /* empty */
                 cpu = -1;
         else
                 cpumask_set_cpu(cpu, &set->used);
-       spin_unlock(&dd->affinity->lock);
-
-       free_cpumask_var(intrs);
-free_mask:
-       free_cpumask_var(mask);
+       spin_unlock(&affinity->lock);
+       hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);
+
+       free_cpumask_var(intrs_mask);
+free_available_mask:
+       free_cpumask_var(available_mask);
+free_hw_thread_mask:
+       free_cpumask_var(hw_thread_mask);
  free_diff:
         free_cpumask_var(diff);
  done:
         return cpu;
  }
  
-void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu)
+void hfi1_put_proc_affinity(int cpu)
  {
-       struct cpu_mask_set *set = &dd->affinity->proc;
+       struct hfi1_affinity_node_list *affinity = &node_affinity;
+       struct cpu_mask_set *set = &affinity->proc;
  
         if (cpu < 0)
                 return;
-       spin_lock(&dd->affinity->lock);
+       spin_lock(&affinity->lock);
         cpumask_clear_cpu(cpu, &set->used);
+       hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
         if (cpumask_empty(&set->used) && set->gen) {
                 set->gen--;
                 cpumask_copy(&set->used, &set->mask);
         }
-       spin_unlock(&dd->affinity->lock);
+       spin_unlock(&affinity->lock);
  }
  
+/* Prevents concurrent reads and writes of the sdma_affinity attrib */
+static DEFINE_MUTEX(sdma_affinity_mutex);
+
+int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf,
+                          size_t count)
+{
+       struct hfi1_affinity_node *entry;
+       struct cpumask mask;
+       int ret, i;
+
+       spin_lock(&node_affinity.lock);
+       entry = node_affinity_lookup(dd->node);
+       spin_unlock(&node_affinity.lock);
+
+       if (!entry)
+               return -EINVAL;
+
+       ret = cpulist_parse(buf, &mask);
+       if (ret)
+               return ret;
+
+       if (!cpumask_subset(&mask, cpu_online_mask) || cpumask_empty(&mask)) {
+               dd_dev_warn(dd, "Invalid CPU mask\n");
+               return -EINVAL;
+       }
+
+       mutex_lock(&sdma_affinity_mutex);
+       /* reset the SDMA interrupt affinity details */
+       init_cpu_mask_set(&entry->def_intr);
+       cpumask_copy(&entry->def_intr.mask, &mask);
+       /*
+        * Reassign the affinity for each SDMA interrupt.
+        */
+       for (i = 0; i < dd->num_msix_entries; i++) {
+               struct hfi1_msix_entry *msix;
+
+               msix = &dd->msix_entries[i];
+               if (msix->type != IRQ_SDMA)
+                       continue;
+
+               ret = hfi1_get_irq_affinity(dd, msix);
+
+               if (ret)
+                       break;
+       }
+
+       mutex_unlock(&sdma_affinity_mutex);
+       return ret ? ret : strnlen(buf, PAGE_SIZE);
+}
+
+int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf)
+{
+       struct hfi1_affinity_node *entry;
+
+       spin_lock(&node_affinity.lock);
+       entry = node_affinity_lookup(dd->node);
+       spin_unlock(&node_affinity.lock);
+
+       if (!entry)
+               return -EINVAL;
+
+       mutex_lock(&sdma_affinity_mutex);
+       cpumap_print_to_pagebuf(true, buf, &entry->def_intr.mask);
+       mutex_unlock(&sdma_affinity_mutex);
+       return strnlen(buf, PAGE_SIZE);
+}
diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h

index 20f52fe7409161cb772b90aae96f8e4c7a012e05..8879cf7a8cac54fede679918f835c2a180a5e962 100644 (file)
--- a/drivers/infiniband/hw/hfi1/affinity.h
+++ b/drivers/infiniband/hw/hfi1/affinity.h
@@ -73,7 +73,6 @@ struct cpu_mask_set {
  struct hfi1_affinity {
         struct cpu_mask_set def_intr;
         struct cpu_mask_set rcv_intr;
-       struct cpu_mask_set proc;
         struct cpumask real_cpu_mask;
         /* spin lock to protect affinity struct */
         spinlock_t lock;
@@ -82,11 +81,9 @@ struct hfi1_affinity {
  struct hfi1_msix_entry;
  
  /* Initialize non-HT cpu cores mask */
-int init_real_cpu_mask(struct hfi1_devdata *);
+void init_real_cpu_mask(void);
  /* Initialize driver affinity data */
-void hfi1_dev_affinity_init(struct hfi1_devdata *);
-/* Free driver affinity data */
-void hfi1_dev_affinity_free(struct hfi1_devdata *);
+int hfi1_dev_affinity_init(struct hfi1_devdata *);
  /*
   * Set IRQ affinity to a CPU. The function will determine the
   * CPU and set the affinity to it.
@@ -101,8 +98,35 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
   * Determine a CPU affinity for a user process, if the process does not
   * have an affinity set yet.
   */
-int hfi1_get_proc_affinity(struct hfi1_devdata *, int);
+int hfi1_get_proc_affinity(int);
  /* Release a CPU used by a user process. */
-void hfi1_put_proc_affinity(struct hfi1_devdata *, int);
+void hfi1_put_proc_affinity(int);
+
+int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf);
+int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf,
+                          size_t count);
+
+struct hfi1_affinity_node {
+       int node;
+       struct cpu_mask_set def_intr;
+       struct cpu_mask_set rcv_intr;
+       struct cpumask general_intr_mask;
+       struct list_head list;
+};
+
+struct hfi1_affinity_node_list {
+       struct list_head list;
+       struct cpumask real_cpu_mask;
+       struct cpu_mask_set proc;
+       int num_core_siblings;
+       int num_online_nodes;
+       int num_online_cpus;
+       /* protect affinity node list */
+       spinlock_t lock;
+};
+
+int node_affinity_init(void);
+void node_affinity_destroy(void);
+extern struct hfi1_affinity_node_list node_affinity;
  
  #endif /* _HFI1_AFFINITY_H */
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c

index dad4d0ebbdffb45e2c667cc95b86aabf39b53a87..b32638d58ae82c73f920afee4a7fd0a4386581a8 100644 (file)
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -63,6 +63,7 @@
  #include "efivar.h"
  #include "platform.h"
  #include "aspm.h"
+#include "affinity.h"
  
  #define NUM_IB_PORTS 1
  
@@ -121,6 +122,7 @@ struct flag_table {
  #define SEC_SC_HALTED          0x4     /* per-context only */
  #define SEC_SPC_FREEZE         0x8     /* per-HFI only */
  
+#define DEFAULT_KRCVQS           2
  #define MIN_KERNEL_KCTXTS         2
  #define FIRST_KERNEL_KCTXT        1
  /* sizes for both the QP and RSM map tables */
@@ -238,6 +240,9 @@ struct flag_table {
  /* all CceStatus sub-block RXE pause bits */
  #define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK
  
+#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
+#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
+
  /*
   * CCE Error flags.
   */
@@ -3947,6 +3952,28 @@ static u64 access_sdma_wrong_dw_err_cnt(const struct cntr_entry *entry,
         return dd->sw_send_dma_eng_err_status_cnt[0];
  }
  
+static u64 access_dc_rcv_err_cnt(const struct cntr_entry *entry,
+                                void *context, int vl, int mode,
+                                u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       u64 val = 0;
+       u64 csr = entry->csr;
+
+       val = read_write_csr(dd, csr, mode, data);
+       if (mode == CNTR_MODE_R) {
+               val = val > CNTR_MAX - dd->sw_rcv_bypass_packet_errors ?
+                       CNTR_MAX : val + dd->sw_rcv_bypass_packet_errors;
+       } else if (mode == CNTR_MODE_W) {
+               dd->sw_rcv_bypass_packet_errors = 0;
+       } else {
+               dd_dev_err(dd, "Invalid cntr register access mode");
+               return 0;
+       }
+       return val;
+}
+
  #define def_access_sw_cpu(cntr) \
  static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry,                      \
                               void *context, int vl, int mode, u64 data)      \
@@ -4020,7 +4047,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
                         CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL),
  [C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT,
                               CNTR_SYNTH),
-[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH),
+[C_DC_RCV_ERR] = CNTR_ELEM("DcRecvErr", DCC_ERR_PORTRCV_ERR_CNT, 0, CNTR_SYNTH,
+                           access_dc_rcv_err_cnt),
  [C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT,
                                  CNTR_SYNTH),
  [C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT,
@@ -8798,30 +8826,6 @@ static int write_tx_settings(struct hfi1_devdata *dd,
         return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame);
  }
  
-static void check_fabric_firmware_versions(struct hfi1_devdata *dd)
-{
-       u32 frame, version, prod_id;
-       int ret, lane;
-
-       /* 4 lanes */
-       for (lane = 0; lane < 4; lane++) {
-               ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame);
-               if (ret) {
-                       dd_dev_err(dd,
-                                  "Unable to read lane %d firmware details\n",
-                                  lane);
-                       continue;
-               }
-               version = (frame >> SPICO_ROM_VERSION_SHIFT)
-                                       & SPICO_ROM_VERSION_MASK;
-               prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT)
-                                       & SPICO_ROM_PROD_ID_MASK;
-               dd_dev_info(dd,
-                           "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n",
-                           lane, version, prod_id);
-       }
-}
-
  /*
   * Read an idle LCB message.
   *
@@ -9187,17 +9191,24 @@ static void wait_for_qsfp_init(struct hfi1_pportdata *ppd)
         unsigned long timeout;
  
         /*
-        * Check for QSFP interrupt for t_init (SFF 8679)
+        * Some QSFP cables have a quirk that asserts the IntN line as a side
+        * effect of power up on plug-in. We ignore this false positive
+        * interrupt until the module has finished powering up by waiting for
+        * a minimum timeout of the module inrush initialization time of
+        * 500 ms (SFF 8679 Table 5-6) to ensure the voltage rails in the
+        * module have stabilized.
+        */
+       msleep(500);
+
+       /*
+        * Check for QSFP interrupt for t_init (SFF 8679 Table 8-1)
          */
         timeout = jiffies + msecs_to_jiffies(2000);
         while (1) {
                 mask = read_csr(dd, dd->hfi1_id ?
                                 ASIC_QSFP2_IN : ASIC_QSFP1_IN);
-               if (!(mask & QSFP_HFI0_INT_N)) {
-                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR :
-                                 ASIC_QSFP1_CLEAR, QSFP_HFI0_INT_N);
+               if (!(mask & QSFP_HFI0_INT_N))
                         break;
-               }
                 if (time_after(jiffies, timeout)) {
                         dd_dev_info(dd, "%s: No IntN detected, reset complete\n",
                                     __func__);
@@ -9213,10 +9224,17 @@ static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable)
         u64 mask;
  
         mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK);
-       if (enable)
+       if (enable) {
+               /*
+                * Clear the status register to avoid an immediate interrupt
+                * when we re-enable the IntN pin
+                */
+               write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR,
+                         QSFP_HFI0_INT_N);
                 mask |= (u64)QSFP_HFI0_INT_N;
-       else
+       } else {
                 mask &= ~(u64)QSFP_HFI0_INT_N;
+       }
         write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask);
  }
  
@@ -9630,14 +9648,6 @@ void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
                 hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
  }
  
-int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
-                       struct hfi1_ctxt_info *kinfo)
-{
-       kinfo->runtime_flags = (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) |
-               HFI1_CAP_UGET(MASK) | HFI1_CAP_KGET(K2U);
-       return 0;
-}
-
  struct hfi1_message_header *hfi1_get_msgheader(
                                 struct hfi1_devdata *dd, __le32 *rhf_addr)
  {
@@ -9890,6 +9900,131 @@ static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs)
         return 0;
  }
  
+static const char *state_completed_string(u32 completed)
+{
+       static const char * const state_completed[] = {
+               "EstablishComm",
+               "OptimizeEQ",
+               "VerifyCap"
+       };
+
+       if (completed < ARRAY_SIZE(state_completed))
+               return state_completed[completed];
+
+       return "unknown";
+}
+
+static const char all_lanes_dead_timeout_expired[] =
+       "All lanes were inactive – was the interconnect media removed?";
+static const char tx_out_of_policy[] =
+       "Passing lanes on local port do not meet the local link width policy";
+static const char no_state_complete[] =
+       "State timeout occurred before link partner completed the state";
+static const char * const state_complete_reasons[] = {
+       [0x00] = "Reason unknown",
+       [0x01] = "Link was halted by driver, refer to LinkDownReason",
+       [0x02] = "Link partner reported failure",
+       [0x10] = "Unable to achieve frame sync on any lane",
+       [0x11] =
+         "Unable to find a common bit rate with the link partner",
+       [0x12] =
+         "Unable to achieve frame sync on sufficient lanes to meet the local link width policy",
+       [0x13] =
+         "Unable to identify preset equalization on sufficient lanes to meet the local link width policy",
+       [0x14] = no_state_complete,
+       [0x15] =
+         "State timeout occurred before link partner identified equalization presets",
+       [0x16] =
+         "Link partner completed the EstablishComm state, but the passing lanes do not meet the local link width policy",
+       [0x17] = tx_out_of_policy,
+       [0x20] = all_lanes_dead_timeout_expired,
+       [0x21] =
+         "Unable to achieve acceptable BER on sufficient lanes to meet the local link width policy",
+       [0x22] = no_state_complete,
+       [0x23] =
+         "Link partner completed the OptimizeEq state, but the passing lanes do not meet the local link width policy",
+       [0x24] = tx_out_of_policy,
+       [0x30] = all_lanes_dead_timeout_expired,
+       [0x31] =
+         "State timeout occurred waiting for host to process received frames",
+       [0x32] = no_state_complete,
+       [0x33] =
+         "Link partner completed the VerifyCap state, but the passing lanes do not meet the local link width policy",
+       [0x34] = tx_out_of_policy,
+};
+
+static const char *state_complete_reason_code_string(struct hfi1_pportdata *ppd,
+                                                    u32 code)
+{
+       const char *str = NULL;
+
+       if (code < ARRAY_SIZE(state_complete_reasons))
+               str = state_complete_reasons[code];
+
+       if (str)
+               return str;
+       return "Reserved";
+}
+
+/* describe the given last state complete frame */
+static void decode_state_complete(struct hfi1_pportdata *ppd, u32 frame,
+                                 const char *prefix)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 success;
+       u32 state;
+       u32 reason;
+       u32 lanes;
+
+       /*
+        * Decode frame:
+        *  [ 0: 0] - success
+        *  [ 3: 1] - state
+        *  [ 7: 4] - next state timeout
+        *  [15: 8] - reason code
+        *  [31:16] - lanes
+        */
+       success = frame & 0x1;
+       state = (frame >> 1) & 0x7;
+       reason = (frame >> 8) & 0xff;
+       lanes = (frame >> 16) & 0xffff;
+
+       dd_dev_err(dd, "Last %s LNI state complete frame 0x%08x:\n",
+                  prefix, frame);
+       dd_dev_err(dd, "    last reported state state: %s (0x%x)\n",
+                  state_completed_string(state), state);
+       dd_dev_err(dd, "    state successfully completed: %s\n",
+                  success ? "yes" : "no");
+       dd_dev_err(dd, "    fail reason 0x%x: %s\n",
+                  reason, state_complete_reason_code_string(ppd, reason));
+       dd_dev_err(dd, "    passing lane mask: 0x%x", lanes);
+}
+
+/*
+ * Read the last state complete frames and explain them.  This routine
+ * expects to be called if the link went down during link negotiation
+ * and initialization (LNI).  That is, anywhere between polling and link up.
+ */
+static void check_lni_states(struct hfi1_pportdata *ppd)
+{
+       u32 last_local_state;
+       u32 last_remote_state;
+
+       read_last_local_state(ppd->dd, &last_local_state);
+       read_last_remote_state(ppd->dd, &last_remote_state);
+
+       /*
+        * Don't report anything if there is nothing to report.  A value of
+        * 0 means the link was taken down while polling and there was no
+        * training in-process.
+        */
+       if (last_local_state == 0 && last_remote_state == 0)
+               return;
+
+       decode_state_complete(ppd, last_local_state, "transmitted");
+       decode_state_complete(ppd, last_remote_state, "received");
+}
+
  /*
   * Helper for set_link_state().  Do not call except from that routine.
   * Expects ppd->hls_mutex to be held.
@@ -9902,8 +10037,6 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
  {
         struct hfi1_devdata *dd = ppd->dd;
         u32 pstate, previous_state;
-       u32 last_local_state;
-       u32 last_remote_state;
         int ret;
         int do_transition;
         int do_wait;
@@ -10003,12 +10136,7 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
         } else if (previous_state
                         & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
                 /* went down while attempting link up */
-               /* byte 1 of last_*_state is the failure reason */
-               read_last_local_state(dd, &last_local_state);
-               read_last_remote_state(dd, &last_remote_state);
-               dd_dev_err(dd,
-                          "LNI failure last states: local 0x%08x, remote 0x%08x\n",
-                          last_local_state, last_remote_state);
+               check_lni_states(ppd);
         }
  
         /* the active link width (downgrade) is 0 on link down */
@@ -11668,9 +11796,6 @@ static void free_cntrs(struct hfi1_devdata *dd)
         dd->cntrnames = NULL;
  }
  
-#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
-#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
-
  static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
                               u64 *psval, void *context, int vl)
  {
@@ -12325,37 +12450,6 @@ u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
         return ib_pstate;
  }
  
-/*
- * Read/modify/write ASIC_QSFP register bits as selected by mask
- * data: 0 or 1 in the positions depending on what needs to be written
- * dir: 0 for read, 1 for write
- * mask: select by setting
- *      I2CCLK  (bit 0)
- *      I2CDATA (bit 1)
- */
-u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
-                 u32 mask)
-{
-       u64 qsfp_oe, target_oe;
-
-       target_oe = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE;
-       if (mask) {
-               /* We are writing register bits, so lock access */
-               dir &= mask;
-               data &= mask;
-
-               qsfp_oe = read_csr(dd, target_oe);
-               qsfp_oe = (qsfp_oe & ~(u64)mask) | (u64)dir;
-               write_csr(dd, target_oe, qsfp_oe);
-       }
-       /* We are exclusively reading bits here, but it is unlikely
-        * we'll get valid data when we set the direction of the pin
-        * in the same call, so read should call this function again
-        * to get valid data
-        */
-       return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
-}
-
  #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
  (r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
  
@@ -12780,7 +12874,6 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
  
         /*
          * Kernel receive contexts:
-        * - min of 2 or 1 context/numa (excluding control context)
          * - Context 0 - control context (VL15/multicast/error)
          * - Context 1 - first kernel context
          * - Context 2 - second kernel context
@@ -12794,9 +12887,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
                  */
                 num_kernel_contexts = n_krcvqs + 1;
         else
-               num_kernel_contexts = num_online_nodes() + 1;
-       num_kernel_contexts =
-               max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts);
+               num_kernel_contexts = DEFAULT_KRCVQS + 1;
         /*
          * Every kernel receive context needs an ACK send context.
          * one send context is allocated for each VL{0-7} and VL15
@@ -12815,7 +12906,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
          */
         if (num_user_contexts < 0)
                 num_user_contexts =
-                       cpumask_weight(&dd->affinity->real_cpu_mask);
+                       cpumask_weight(&node_affinity.real_cpu_mask);
  
         total_contexts = num_kernel_contexts + num_user_contexts;
  
@@ -14141,6 +14232,11 @@ static int init_asic_data(struct hfi1_devdata *dd)
         }
         dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */
         spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+       /* first one through - set up i2c devices */
+       if (!peer)
+               ret = set_up_i2c(dd, dd->asic_data);
+
         return ret;
  }
  
@@ -14445,19 +14541,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
                  (dd->revision >> CCE_REVISION_SW_SHIFT)
                     & CCE_REVISION_SW_MASK);
  
-       /*
-        * The real cpu mask is part of the affinity struct but has to be
-        * initialized earlier than the rest of the affinity struct because it
-        * is needed to calculate the number of user contexts in
-        * set_up_context_variables(). However, hfi1_dev_affinity_init(),
-        * which initializes the rest of the affinity struct members,
-        * depends on set_up_context_variables() for the number of kernel
-        * contexts, so it cannot be called before set_up_context_variables().
-        */
-       ret = init_real_cpu_mask(dd);
-       if (ret)
-               goto bail_cleanup;
-
         ret = set_up_context_variables(dd);
         if (ret)
                 goto bail_cleanup;
@@ -14471,7 +14554,9 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
         /* set up KDETH QP prefix in both RX and TX CSRs */
         init_kdeth_qp(dd);
  
-       hfi1_dev_affinity_init(dd);
+       ret = hfi1_dev_affinity_init(dd);
+       if (ret)
+               goto bail_cleanup;
  
         /* send contexts must be set up before receive contexts */
         ret = init_send_contexts(dd);
@@ -14508,8 +14593,14 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
         /* set up LCB access - must be after set_up_interrupts() */
         init_lcb_access(dd);
  
+       /*
+        * Serial number is created from the base guid:
+        * [27:24] = base guid [38:35]
+        * [23: 0] = base guid [23: 0]
+        */
         snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n",
-                dd->base_guid & 0xFFFFFF);
+                (dd->base_guid & 0xFFFFFF) |
+                    ((dd->base_guid >> 11) & 0xF000000));
  
         dd->oui1 = dd->base_guid >> 56 & 0xFF;
         dd->oui2 = dd->base_guid >> 48 & 0xFF;
@@ -14518,7 +14609,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
         ret = load_firmware(dd); /* asymmetric with dispose_firmware() */
         if (ret)
                 goto bail_clear_intr;
-       check_fabric_firmware_versions(dd);
  
         thermal_init(dd);
  
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h

index 66a327978739dd0b33e82146b60cc26c069e8703..ed11107c50fe50614efc39b469a31cb7e240291c 100644 (file)
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -640,6 +640,7 @@ extern uint platform_config_load;
  /* SBus commands */
  #define RESET_SBUS_RECEIVER 0x20
  #define WRITE_SBUS_RECEIVER 0x21
+#define READ_SBUS_RECEIVER  0x22
  void sbus_request(struct hfi1_devdata *dd,
                   u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
  int sbus_request_slow(struct hfi1_devdata *dd,
@@ -1336,10 +1337,6 @@ void hfi1_start_cleanup(struct hfi1_devdata *dd);
  void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
  struct hfi1_message_header *hfi1_get_msgheader(
                                 struct hfi1_devdata *dd, __le32 *rhf_addr);
-int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
-                       struct hfi1_ctxt_info *kinfo);
-u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
-                 u32 mask);
  int hfi1_init_ctxt(struct send_context *sc);
  void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
                   u32 type, unsigned long pa, u16 order);
diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h

index 8744de6667c25fbe22c37f84c247540744886c4b..5b999389978946003b84f202c4f715e0a0ade242 100644 (file)
--- a/drivers/infiniband/hw/hfi1/chip_registers.h
+++ b/drivers/infiniband/hw/hfi1/chip_registers.h
@@ -471,6 +471,10 @@
  #define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010)
  #define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull
  #define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull
+#define ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT 2
+#define ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK 0x7ull
+#define ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT 32
+#define ASIC_STS_SBUS_RESULT_DATA_OUT_MASK 0xFFFFFFFFull
  #define ASIC_STS_THERM (ASIC + 0x000000000058)
  #define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull
  #define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c

index c75b0ae688f87713a922be6df18ab19efb67e56b..8246dc7d0573a1be768fb7d65fab2fba01c7c686 100644 (file)
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -392,9 +392,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
                         u16 rlid;
                         u8 svc_type, sl, sc5;
  
-                       sc5  = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf;
-                       if (rhf_dc_info(packet->rhf))
-                               sc5 |= 0x10;
+                       sc5 = hdr2sc(rhdr, packet->rhf);
                         sl = ibp->sc_to_sl[sc5];
  
                         lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK;
@@ -450,14 +448,20 @@ static inline void init_packet(struct hfi1_ctxtdata *rcd,
         packet->rcv_flags = 0;
  }
  
-static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr,
-                       struct hfi1_other_headers *ohdr,
-                       u64 rhf, u32 bth1, struct ib_grh *grh)
+void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
+                              bool do_cnp)
  {
         struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       u32 rqpn = 0;
-       u16 rlid;
-       u8 sc5, svc_type;
+       struct hfi1_ib_header *hdr = pkt->hdr;
+       struct hfi1_other_headers *ohdr = pkt->ohdr;
+       struct ib_grh *grh = NULL;
+       u32 rqpn = 0, bth1;
+       u16 rlid, dlid = be16_to_cpu(hdr->lrh[1]);
+       u8 sc, svc_type;
+       bool is_mcast = false;
+
+       if (pkt->rcv_flags & HFI1_HAS_GRH)
+               grh = &hdr->u.l.grh;
  
         switch (qp->ibqp.qp_type) {
         case IB_QPT_SMI:
@@ -466,6 +470,8 @@ static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr,
                 rlid = be16_to_cpu(hdr->lrh[3]);
                 rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
                 svc_type = IB_CC_SVCTYPE_UD;
+               is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
+                       (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
                 break;
         case IB_QPT_UC:
                 rlid = qp->remote_ah_attr.dlid;
@@ -481,24 +487,23 @@ static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr,
                 return;
         }
  
-       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-       if (rhf_dc_info(rhf))
-               sc5 |= 0x10;
+       sc = hdr2sc((struct hfi1_message_header *)hdr, pkt->rhf);
  
-       if (bth1 & HFI1_FECN_SMASK) {
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       if (do_cnp && (bth1 & HFI1_FECN_SMASK)) {
                 u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
-               u16 dlid = be16_to_cpu(hdr->lrh[1]);
  
-               return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc5, grh);
+               return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc, grh);
         }
  
-       if (bth1 & HFI1_BECN_SMASK) {
+       if (!is_mcast && (bth1 & HFI1_BECN_SMASK)) {
                 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
                 u32 lqpn = bth1 & RVT_QPN_MASK;
-               u8 sl = ibp->sc_to_sl[sc5];
+               u8 sl = ibp->sc_to_sl[sc];
  
                 process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
         }
+
  }
  
  struct ps_mdata {
@@ -596,7 +601,6 @@ static void __prescan_rxq(struct hfi1_packet *packet)
                 struct rvt_qp *qp;
                 struct hfi1_ib_header *hdr;
                 struct hfi1_other_headers *ohdr;
-               struct ib_grh *grh = NULL;
                 struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
                 u64 rhf = rhf_to_cpu(rhf_addr);
                 u32 etype = rhf_rcv_type(rhf), qpn, bth1;
@@ -616,14 +620,13 @@ static void __prescan_rxq(struct hfi1_packet *packet)
                         hfi1_get_msgheader(dd, rhf_addr);
                 lnh = be16_to_cpu(hdr->lrh[0]) & 3;
  
-               if (lnh == HFI1_LRH_BTH) {
+               if (lnh == HFI1_LRH_BTH)
                         ohdr = &hdr->u.oth;
-               } else if (lnh == HFI1_LRH_GRH) {
+               else if (lnh == HFI1_LRH_GRH)
                         ohdr = &hdr->u.l.oth;
-                       grh = &hdr->u.l.grh;
-               } else {
+               else
                         goto next; /* just in case */
-               }
+
                 bth1 = be32_to_cpu(ohdr->bth[1]);
                 is_ecn = !!(bth1 & (HFI1_FECN_SMASK | HFI1_BECN_SMASK));
  
@@ -639,7 +642,7 @@ static void __prescan_rxq(struct hfi1_packet *packet)
                         goto next;
                 }
  
-               process_ecn(qp, hdr, ohdr, rhf, bth1, grh);
+               process_ecn(qp, packet, true);
                 rcu_read_unlock();
  
                 /* turn off BECN, FECN */
@@ -1362,6 +1365,7 @@ int process_receive_bypass(struct hfi1_packet *packet)
  
         dd_dev_err(packet->rcd->dd,
                    "Bypass packets are not supported in normal operation. Dropping\n");
+       incr_cntr64(&packet->rcd->dd->sw_rcv_bypass_packet_errors);
         return RHF_RCV_CONTINUE;
  }
  
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c

index c702a009608f27a62b06b7ad4eb2d54914fab03b..1ecbec1923589c3ec96d0958ec45541aa2b5724b 100644 (file)
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -168,6 +168,7 @@ static inline int is_valid_mmap(u64 token)
  
  static int hfi1_file_open(struct inode *inode, struct file *fp)
  {
+       struct hfi1_filedata *fd;
         struct hfi1_devdata *dd = container_of(inode->i_cdev,
                                                struct hfi1_devdata,
                                                user_cdev);
@@ -176,10 +177,17 @@ static int hfi1_file_open(struct inode *inode, struct file *fp)
         kobject_get(&dd->kobj);
  
         /* The real work is performed later in assign_ctxt() */
-       fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL);
-       if (fp->private_data) /* no cpu affinity by default */
-               ((struct hfi1_filedata *)fp->private_data)->rec_cpu_num = -1;
-       return fp->private_data ? 0 : -ENOMEM;
+
+       fd = kzalloc(sizeof(*fd), GFP_KERNEL);
+
+       if (fd) {
+               fd->rec_cpu_num = -1; /* no cpu affinity by default */
+               fd->mm = current->mm;
+       }
+
+       fp->private_data = fd;
+
+       return fd ? 0 : -ENOMEM;
  }
  
  static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
@@ -228,7 +236,7 @@ static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
                                     sizeof(struct hfi1_base_info));
                 break;
         case HFI1_IOCTL_CREDIT_UPD:
-               if (uctxt && uctxt->sc)
+               if (uctxt)
                         sc_return_credits(uctxt->sc);
                 break;
  
@@ -392,41 +400,38 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
         struct hfi1_filedata *fd = kiocb->ki_filp->private_data;
         struct hfi1_user_sdma_pkt_q *pq = fd->pq;
         struct hfi1_user_sdma_comp_q *cq = fd->cq;
-       int ret = 0, done = 0, reqs = 0;
+       int done = 0, reqs = 0;
         unsigned long dim = from->nr_segs;
  
-       if (!cq || !pq) {
-               ret = -EIO;
-               goto done;
-       }
+       if (!cq || !pq)
+               return -EIO;
  
-       if (!iter_is_iovec(from) || !dim) {
-               ret = -EINVAL;
-               goto done;
-       }
+       if (!iter_is_iovec(from) || !dim)
+               return -EINVAL;
  
         hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
                   fd->uctxt->ctxt, fd->subctxt, dim);
  
-       if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) {
-               ret = -ENOSPC;
-               goto done;
-       }
+       if (atomic_read(&pq->n_reqs) == pq->n_max_reqs)
+               return -ENOSPC;
  
         while (dim) {
+               int ret;
                 unsigned long count = 0;
  
                 ret = hfi1_user_sdma_process_request(
                         kiocb->ki_filp, (struct iovec *)(from->iov + done),
                         dim, &count);
-               if (ret)
-                       goto done;
+               if (ret) {
+                       reqs = ret;
+                       break;
+               }
                 dim -= count;
                 done += count;
                 reqs++;
         }
-done:
-       return ret ? ret : reqs;
+
+       return reqs;
  }
  
  static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
@@ -718,7 +723,7 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
         hfi1_user_sdma_free_queues(fdata);
  
         /* release the cpu */
-       hfi1_put_proc_affinity(dd, fdata->rec_cpu_num);
+       hfi1_put_proc_affinity(fdata->rec_cpu_num);
  
         /*
          * Clear any left over, unhandled events so the next process that
@@ -730,7 +735,6 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
  
         if (--uctxt->cnt) {
                 uctxt->active_slaves &= ~(1 << fdata->subctxt);
-               uctxt->subpid[fdata->subctxt] = 0;
                 mutex_unlock(&hfi1_mutex);
                 goto done;
         }
@@ -756,7 +760,6 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
         write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
                         hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
         sc_disable(uctxt->sc);
-       uctxt->pid = 0;
         spin_unlock_irqrestore(&dd->uctxt_lock, flags);
  
         dd->rcd[uctxt->ctxt] = NULL;
@@ -818,9 +821,10 @@ static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo)
                 ret = find_shared_ctxt(fp, uinfo);
                 if (ret < 0)
                         goto done_unlock;
-               if (ret)
-                       fd->rec_cpu_num = hfi1_get_proc_affinity(
-                               fd->uctxt->dd, fd->uctxt->numa_id);
+               if (ret) {
+                       fd->rec_cpu_num =
+                               hfi1_get_proc_affinity(fd->uctxt->numa_id);
+               }
         }
  
         /*
@@ -895,7 +899,6 @@ static int find_shared_ctxt(struct file *fp,
                         }
                         fd->uctxt = uctxt;
                         fd->subctxt  = uctxt->cnt++;
-                       uctxt->subpid[fd->subctxt] = current->pid;
                         uctxt->active_slaves |= 1 << fd->subctxt;
                         ret = 1;
                         goto done;
@@ -932,7 +935,11 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
         if (ctxt == dd->num_rcv_contexts)
                 return -EBUSY;
  
-       fd->rec_cpu_num = hfi1_get_proc_affinity(dd, -1);
+       /*
+        * If no NUMA node was requested, prefer the NUMA node of the
+        * device.
+        */
+       fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node);
         if (fd->rec_cpu_num != -1)
                 numa = cpu_to_node(fd->rec_cpu_num);
         else
@@ -976,8 +983,7 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
                         return ret;
         }
         uctxt->userversion = uinfo->userversion;
-       uctxt->pid = current->pid;
-       uctxt->flags = HFI1_CAP_UGET(MASK);
+       uctxt->flags = hfi1_cap_mask; /* save current flag state */
         init_waitqueue_head(&uctxt->wait);
         strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
         memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
@@ -1080,18 +1086,18 @@ static int user_init(struct file *fp)
         hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
  
         rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP))
+       if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP))
                 rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
         /*
          * Ignore the bit in the flags for now until proper
          * support for multiple packet per rcv array entry is
          * added.
          */
-       if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
+       if (!HFI1_CAP_UGET_MASK(uctxt->flags, MULTI_PKT_EGR))
                 rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
+       if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_EGR_FULL))
                 rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
+       if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
                 rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
         /*
          * The RcvCtxtCtrl.TailUpd bit has to be explicitly written.
@@ -1099,7 +1105,7 @@ static int user_init(struct file *fp)
          * uses of the chip or ctxt. Therefore, add the rcvctrl op
          * for both cases.
          */
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
+       if (HFI1_CAP_UGET_MASK(uctxt->flags, DMA_RTAIL))
                 rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
         else
                 rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
@@ -1122,9 +1128,14 @@ static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
         int ret = 0;
  
         memset(&cinfo, 0, sizeof(cinfo));
-       ret = hfi1_get_base_kinfo(uctxt, &cinfo);
-       if (ret < 0)
-               goto done;
+       cinfo.runtime_flags = (((uctxt->flags >> HFI1_CAP_MISC_SHIFT) &
+                               HFI1_CAP_MISC_MASK) << HFI1_CAP_USER_SHIFT) |
+                       HFI1_CAP_UGET_MASK(uctxt->flags, MASK) |
+                       HFI1_CAP_KGET_MASK(uctxt->flags, K2U);
+       /* adjust flag if this fd is not able to cache */
+       if (!fd->handler)
+               cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */
+
         cinfo.num_active = hfi1_count_active_units();
         cinfo.unit = uctxt->dd->unit;
         cinfo.ctxt = uctxt->ctxt;
@@ -1146,7 +1157,7 @@ static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
         trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo);
         if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
                 ret = -EFAULT;
-done:
+
         return ret;
  }
  
diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c

index ed680fda611dfdcee292e2613ee6e2d13fdfd891..13db8eb4f4ece1949e699c9e8b509a4fa70598a0 100644 (file)
--- a/drivers/infiniband/hw/hfi1/firmware.c
+++ b/drivers/infiniband/hw/hfi1/firmware.c
@@ -206,6 +206,9 @@ static const struct firmware *platform_config;
  /* the number of fabric SerDes on the SBus */
  #define NUM_FABRIC_SERDES 4
  
+/* ASIC_STS_SBUS_RESULT.RESULT_CODE value */
+#define SBUS_READ_COMPLETE 0x4
+
  /* SBus fabric SerDes addresses, one set per HFI */
  static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = {
         { 0x01, 0x02, 0x03, 0x04 },
@@ -240,6 +243,7 @@ static const u8 all_pcie_serdes_broadcast = 0xe0;
  static void dispose_one_firmware(struct firmware_details *fdet);
  static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
                                        struct firmware_details *fdet);
+static void dump_fw_version(struct hfi1_devdata *dd);
  
  /*
   * Read a single 64-bit value from 8051 data memory.
@@ -1078,6 +1082,44 @@ void sbus_request(struct hfi1_devdata *dd,
                    ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT));
  }
  
+/*
+ * Read a value from the SBus.
+ *
+ * Requires the caller to be in fast mode
+ */
+static u32 sbus_read(struct hfi1_devdata *dd, u8 receiver_addr, u8 data_addr,
+                    u32 data_in)
+{
+       u64 reg;
+       int retries;
+       int success = 0;
+       u32 result = 0;
+       u32 result_code = 0;
+
+       sbus_request(dd, receiver_addr, data_addr, READ_SBUS_RECEIVER, data_in);
+
+       for (retries = 0; retries < 100; retries++) {
+               usleep_range(1000, 1200); /* arbitrary */
+               reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+               result_code = (reg >> ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT)
+                               & ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK;
+               if (result_code != SBUS_READ_COMPLETE)
+                       continue;
+
+               success = 1;
+               result = (reg >> ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT)
+                          & ASIC_STS_SBUS_RESULT_DATA_OUT_MASK;
+               break;
+       }
+
+       if (!success) {
+               dd_dev_err(dd, "%s: read failed, result code 0x%x\n", __func__,
+                          result_code);
+       }
+
+       return result;
+}
+
  /*
   * Turn off the SBus and fabric serdes spicos.
   *
@@ -1636,6 +1678,7 @@ int load_firmware(struct hfi1_devdata *dd)
                         return ret;
         }
  
+       dump_fw_version(dd);
         return 0;
  }
  
@@ -2054,3 +2097,85 @@ void read_guid(struct hfi1_devdata *dd)
         dd_dev_info(dd, "GUID %llx",
                     (unsigned long long)dd->base_guid);
  }
+
+/* read and display firmware version info */
+static void dump_fw_version(struct hfi1_devdata *dd)
+{
+       u32 pcie_vers[NUM_PCIE_SERDES];
+       u32 fabric_vers[NUM_FABRIC_SERDES];
+       u32 sbus_vers;
+       int i;
+       int all_same;
+       int ret;
+       u8 rcv_addr;
+
+       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+       if (ret) {
+               dd_dev_err(dd, "Unable to acquire SBus to read firmware versions\n");
+               return;
+       }
+
+       /* set fast mode */
+       set_sbus_fast_mode(dd);
+
+       /* read version for SBus Master */
+       sbus_request(dd, SBUS_MASTER_BROADCAST, 0x02, WRITE_SBUS_RECEIVER, 0);
+       sbus_request(dd, SBUS_MASTER_BROADCAST, 0x07, WRITE_SBUS_RECEIVER, 0x1);
+       /* wait for interrupt to be processed */
+       usleep_range(10000, 11000);
+       sbus_vers = sbus_read(dd, SBUS_MASTER_BROADCAST, 0x08, 0x1);
+       dd_dev_info(dd, "SBus Master firmware version 0x%08x\n", sbus_vers);
+
+       /* read version for PCIe SerDes */
+       all_same = 1;
+       pcie_vers[0] = 0;
+       for (i = 0; i < NUM_PCIE_SERDES; i++) {
+               rcv_addr = pcie_serdes_addrs[dd->hfi1_id][i];
+               sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0);
+               /* wait for interrupt to be processed */
+               usleep_range(10000, 11000);
+               pcie_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0);
+               if (i > 0 && pcie_vers[0] != pcie_vers[i])
+                       all_same = 0;
+       }
+
+       if (all_same) {
+               dd_dev_info(dd, "PCIe SerDes firmware version 0x%x\n",
+                           pcie_vers[0]);
+       } else {
+               dd_dev_warn(dd, "PCIe SerDes do not have the same firmware version\n");
+               for (i = 0; i < NUM_PCIE_SERDES; i++) {
+                       dd_dev_info(dd,
+                                   "PCIe SerDes lane %d firmware version 0x%x\n",
+                                   i, pcie_vers[i]);
+               }
+       }
+
+       /* read version for fabric SerDes */
+       all_same = 1;
+       fabric_vers[0] = 0;
+       for (i = 0; i < NUM_FABRIC_SERDES; i++) {
+               rcv_addr = fabric_serdes_addrs[dd->hfi1_id][i];
+               sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0);
+               /* wait for interrupt to be processed */
+               usleep_range(10000, 11000);
+               fabric_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0);
+               if (i > 0 && fabric_vers[0] != fabric_vers[i])
+                       all_same = 0;
+       }
+
+       if (all_same) {
+               dd_dev_info(dd, "Fabric SerDes firmware version 0x%x\n",
+                           fabric_vers[0]);
+       } else {
+               dd_dev_warn(dd, "Fabric SerDes do not have the same firmware version\n");
+               for (i = 0; i < NUM_FABRIC_SERDES; i++) {
+                       dd_dev_info(dd,
+                                   "Fabric SerDes lane %d firmware version 0x%x\n",
+                                   i, fabric_vers[i]);
+               }
+       }
+
+       clear_sbus_fast_mode(dd);
+       release_chip_resource(dd, CR_SBUS);
+}
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h

index 4417a0fd3ef9032bb6336f4afbb30b1241b3727c..1000e0fd96d9b4972cec3327739f5ca7ff59b7b3 100644 (file)
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -62,6 +62,8 @@
  #include <linux/cdev.h>
  #include <linux/delay.h>
  #include <linux/kthread.h>
+#include <linux/i2c.h>
+#include <linux/i2c-algo-bit.h>
  #include <rdma/rdma_vt.h>
  
  #include "chip_registers.h"
@@ -253,7 +255,7 @@ struct hfi1_ctxtdata {
         /* chip offset of PIO buffers for this ctxt */
         u32 piobufs;
         /* per-context configuration flags */
-       u32 flags;
+       unsigned long flags;
         /* per-context event flags for fileops/intr communication */
         unsigned long event_flags;
         /* WAIT_RCV that timed out, no interrupt */
@@ -268,9 +270,6 @@ struct hfi1_ctxtdata {
         u32 urgent;
         /* saved total number of polled urgent packets for poll edge trigger */
         u32 urgent_poll;
-       /* pid of process using this ctxt */
-       pid_t pid;
-       pid_t subpid[HFI1_MAX_SHARED_CTXTS];
         /* same size as task_struct .comm[], command that opened context */
         char comm[TASK_COMM_LEN];
         /* so file ops can get at unit */
@@ -366,11 +365,6 @@ struct hfi1_packet {
         u8 etype;
  };
  
-static inline bool has_sc4_bit(struct hfi1_packet *p)
-{
-       return !!rhf_dc_info(p->rhf);
-}
-
  /*
   * Private data for snoop/capture support.
   */
@@ -805,10 +799,19 @@ struct hfi1_temp {
         u8 triggers;      /* temperature triggers */
  };
  
+struct hfi1_i2c_bus {
+       struct hfi1_devdata *controlling_dd; /* current controlling device */
+       struct i2c_adapter adapter;     /* bus details */
+       struct i2c_algo_bit_data algo;  /* bus algorithm details */
+       int num;                        /* bus number, 0 or 1 */
+};
+
  /* common data between shared ASIC HFIs */
  struct hfi1_asic_data {
         struct hfi1_devdata *dds[2];    /* back pointers */
         struct mutex asic_resource_mutex;
+       struct hfi1_i2c_bus *i2c_bus0;
+       struct hfi1_i2c_bus *i2c_bus1;
  };
  
  /* device data struct now contains only "general per-device" info.
@@ -1128,7 +1131,8 @@ struct hfi1_devdata {
                 NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS];
         /* Software counter that aggregates all cce_err_status errors */
         u64 sw_cce_err_status_aggregate;
-
+       /* Software counter that aggregates all bypass packet rcv errors */
+       u64 sw_rcv_bypass_packet_errors;
         /* receive interrupt functions */
         rhf_rcv_function_ptr *rhf_rcv_function_map;
         rhf_rcv_function_ptr normal_rhf_rcv_functions[8];
@@ -1174,6 +1178,8 @@ struct hfi1_devdata {
  
  /* 8051 firmware version helper */
  #define dc8051_ver(a, b) ((a) << 8 | (b))
+#define dc8051_ver_maj(a) ((a & 0xff00) >> 8)
+#define dc8051_ver_min(a)  (a & 0x00ff)
  
  /* f_put_tid types */
  #define PT_EXPECTED 0
@@ -1182,6 +1188,7 @@ struct hfi1_devdata {
  
  struct tid_rb_node;
  struct mmu_rb_node;
+struct mmu_rb_handler;
  
  /* Private data for file operations */
  struct hfi1_filedata {
@@ -1192,7 +1199,7 @@ struct hfi1_filedata {
         /* for cpu affinity; -1 if none */
         int rec_cpu_num;
         u32 tid_n_pinned;
-       struct rb_root tid_rb_root;
+       struct mmu_rb_handler *handler;
         struct tid_rb_node **entry_to_rb;
         spinlock_t tid_lock; /* protect tid_[limit,used] counters */
         u32 tid_limit;
@@ -1201,6 +1208,7 @@ struct hfi1_filedata {
         u32 invalid_tid_idx;
         /* protect invalid_tids array and invalid_tid_idx */
         spinlock_t invalid_lock;
+       struct mm_struct *mm;
  };
  
  extern struct list_head hfi1_dev_list;
@@ -1234,6 +1242,8 @@ int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *, int);
  int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *, int);
  void set_all_slowpath(struct hfi1_devdata *dd);
  
+extern const struct pci_device_id hfi1_pci_tbl[];
+
  /* receive packet handler dispositions */
  #define RCV_PKT_OK      0x0 /* keep going */
  #define RCV_PKT_LIMIT   0x1 /* stop, hit limit, start thread */
@@ -1259,7 +1269,7 @@ void receive_interrupt_work(struct work_struct *work);
  static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf)
  {
         return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) |
-              ((!!(rhf & RHF_DC_INFO_SMASK)) << 4);
+              ((!!(rhf_dc_info(rhf))) << 4);
  }
  
  static inline u16 generate_jkey(kuid_t uid)
@@ -1569,6 +1579,22 @@ static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port)
         return &dd->pport[pidx].ibport_data;
  }
  
+void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
+                              bool do_cnp);
+static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt,
+                              bool do_cnp)
+{
+       struct hfi1_other_headers *ohdr = pkt->ohdr;
+       u32 bth1;
+
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
+               hfi1_process_ecn_slowpath(qp, pkt, do_cnp);
+               return bth1 & HFI1_FECN_SMASK;
+       }
+       return false;
+}
+
  /*
   * Return the indexed PKEY from the port PKEY table.
   */
@@ -1586,14 +1612,23 @@ static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index)
  }
  
  /*
- * Readers of cc_state must call get_cc_state() under rcu_read_lock().
- * Writers of cc_state must call get_cc_state() under cc_state_lock.
+ * Called by readers of cc_state only; must be called under rcu_read_lock().
   */
  static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd)
  {
         return rcu_dereference(ppd->cc_state);
  }
  
+/*
+ * Called by writers of cc_state only; must be called under cc_state_lock.
+ */
+static inline
+struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd)
+{
+       return rcu_dereference_protected(ppd->cc_state,
+                                        lockdep_is_held(&ppd->cc_state_lock));
+}
+
  /*
   * values for dd->flags (_device_ related flags)
   */
@@ -1669,9 +1704,12 @@ void shutdown_led_override(struct hfi1_pportdata *ppd);
   */
  #define DEFAULT_RCVHDR_ENTSIZE 32
  
-bool hfi1_can_pin_pages(struct hfi1_devdata *, u32, u32);
-int hfi1_acquire_user_pages(unsigned long, size_t, bool, struct page **);
-void hfi1_release_user_pages(struct mm_struct *, struct page **, size_t, bool);
+bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm,
+                       u32 nlocked, u32 npages);
+int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr,
+                           size_t npages, bool writable, struct page **pages);
+void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
+                            size_t npages, bool dirty);
  
  static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
  {
@@ -1947,4 +1985,55 @@ static inline u32 qsfp_resource(struct hfi1_devdata *dd)
  
  int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
  
+#define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
+#define DD_DEV_ASSIGN(dd)      __assign_str(dev, dev_name(&(dd)->pcidev->dev))
+
+#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
+#define show_packettype(etype)                  \
+__print_symbolic(etype,                         \
+       packettype_name(EXPECTED),              \
+       packettype_name(EAGER),                 \
+       packettype_name(IB),                    \
+       packettype_name(ERROR),                 \
+       packettype_name(BYPASS))
+
+#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode  }
+#define show_ib_opcode(opcode)                             \
+__print_symbolic(opcode,                                   \
+       ib_opcode_name(RC_SEND_FIRST),                     \
+       ib_opcode_name(RC_SEND_MIDDLE),                    \
+       ib_opcode_name(RC_SEND_LAST),                      \
+       ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE),       \
+       ib_opcode_name(RC_SEND_ONLY),                      \
+       ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(RC_RDMA_WRITE_FIRST),               \
+       ib_opcode_name(RC_RDMA_WRITE_MIDDLE),              \
+       ib_opcode_name(RC_RDMA_WRITE_LAST),                \
+       ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+       ib_opcode_name(RC_RDMA_WRITE_ONLY),                \
+       ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+       ib_opcode_name(RC_RDMA_READ_REQUEST),              \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST),       \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE),      \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST),        \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY),        \
+       ib_opcode_name(RC_ACKNOWLEDGE),                    \
+       ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
+       ib_opcode_name(RC_COMPARE_SWAP),                   \
+       ib_opcode_name(RC_FETCH_ADD),                      \
+       ib_opcode_name(UC_SEND_FIRST),                     \
+       ib_opcode_name(UC_SEND_MIDDLE),                    \
+       ib_opcode_name(UC_SEND_LAST),                      \
+       ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE),       \
+       ib_opcode_name(UC_SEND_ONLY),                      \
+       ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(UC_RDMA_WRITE_FIRST),               \
+       ib_opcode_name(UC_RDMA_WRITE_MIDDLE),              \
+       ib_opcode_name(UC_RDMA_WRITE_LAST),                \
+       ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+       ib_opcode_name(UC_RDMA_WRITE_ONLY),                \
+       ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+       ib_opcode_name(UD_SEND_ONLY),                      \
+       ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(CNP))
  #endif                          /* _HFI1_KERNEL_H */
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c

index eed971ccd2a1e88e7180f2eae5aba92da5448c96..a358d23ecd54d563d05a14e10c0cd885f6546b8f 100644 (file)
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -64,6 +64,7 @@
  #include "debugfs.h"
  #include "verbs.h"
  #include "aspm.h"
+#include "affinity.h"
  
  #undef pr_fmt
  #define pr_fmt(fmt) DRIVER_NAME ": " fmt
@@ -474,8 +475,9 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
  void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
                          struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
  {
-       int i, size;
+       int i;
         uint default_pkey_idx;
+       struct cc_state *cc_state;
  
         ppd->dd = dd;
         ppd->hw_pidx = hw_pidx;
@@ -526,9 +528,9 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
  
         spin_lock_init(&ppd->cc_state_lock);
         spin_lock_init(&ppd->cc_log_lock);
-       size = sizeof(struct cc_state);
-       RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL));
-       if (!rcu_dereference(ppd->cc_state))
+       cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL);
+       RCU_INIT_POINTER(ppd->cc_state, cc_state);
+       if (!cc_state)
                 goto bail;
         return;
  
@@ -972,39 +974,49 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
  
  /*
   * Release our hold on the shared asic data.  If we are the last one,
- * free the structure.  Must be holding hfi1_devs_lock.
+ * return the structure to be finalized outside the lock.  Must be
+ * holding hfi1_devs_lock.
   */
-static void release_asic_data(struct hfi1_devdata *dd)
+static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd)
  {
+       struct hfi1_asic_data *ad;
         int other;
  
         if (!dd->asic_data)
-               return;
+               return NULL;
         dd->asic_data->dds[dd->hfi1_id] = NULL;
         other = dd->hfi1_id ? 0 : 1;
-       if (!dd->asic_data->dds[other]) {
-               /* we are the last holder, free it */
-               kfree(dd->asic_data);
-       }
+       ad = dd->asic_data;
         dd->asic_data = NULL;
+       /* return NULL if the other dd still has a link */
+       return ad->dds[other] ? NULL : ad;
+}
+
+static void finalize_asic_data(struct hfi1_devdata *dd,
+                              struct hfi1_asic_data *ad)
+{
+       clean_up_i2c(dd, ad);
+       kfree(ad);
  }
  
  static void __hfi1_free_devdata(struct kobject *kobj)
  {
         struct hfi1_devdata *dd =
                 container_of(kobj, struct hfi1_devdata, kobj);
+       struct hfi1_asic_data *ad;
         unsigned long flags;
  
         spin_lock_irqsave(&hfi1_devs_lock, flags);
         idr_remove(&hfi1_unit_table, dd->unit);
         list_del(&dd->list);
-       release_asic_data(dd);
+       ad = release_asic_data(dd);
         spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       if (ad)
+               finalize_asic_data(dd, ad);
         free_platform_config(dd);
         rcu_barrier(); /* wait for rcu callbacks to complete */
         free_percpu(dd->int_counter);
         free_percpu(dd->rcv_limit);
-       hfi1_dev_affinity_free(dd);
         free_percpu(dd->send_schedule);
         rvt_dealloc_device(&dd->verbs_dev.rdi);
  }
@@ -1162,7 +1174,7 @@ static int init_one(struct pci_dev *, const struct pci_device_id *);
  #define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
  #define PFX DRIVER_NAME ": "
  
-static const struct pci_device_id hfi1_pci_tbl[] = {
+const struct pci_device_id hfi1_pci_tbl[] = {
         { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
         { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
         { 0, }
@@ -1198,6 +1210,10 @@ static int __init hfi1_mod_init(void)
         if (ret)
                 goto bail;
  
+       ret = node_affinity_init();
+       if (ret)
+               goto bail;
+
         /* validate max MTU before any devices start */
         if (!valid_opa_max_mtu(hfi1_max_mtu)) {
                 pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
@@ -1278,6 +1294,7 @@ module_init(hfi1_mod_init);
  static void __exit hfi1_mod_cleanup(void)
  {
         pci_unregister_driver(&hfi1_pci_driver);
+       node_affinity_destroy();
         hfi1_wss_exit();
         hfi1_dbg_exit();
         hfi1_cpulist_count = 0;
@@ -1311,7 +1328,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd)
                         hrtimer_cancel(&ppd->cca_timer[i].hrtimer);
  
                 spin_lock(&ppd->cc_state_lock);
-               cc_state = get_cc_state(ppd);
+               cc_state = get_cc_state_protected(ppd);
                 RCU_INIT_POINTER(ppd->cc_state, NULL);
                 spin_unlock(&ppd->cc_state_lock);
  
@@ -1760,8 +1777,8 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
  
         hfi1_cdbg(PROC,
                   "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
-                 rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size,
-                 rcd->egrbufs.size);
+                 rcd->ctxt, rcd->egrbufs.alloced,
+                 rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);
  
         /*
          * Set the contexts rcv array head update threshold to the closest
diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c

index fca07a1d6c284b90238543777e85462cc2249651..1263abe01999e3e84306b5594921bc4b11019fca 100644 (file)
--- a/drivers/infiniband/hw/hfi1/mad.c
+++ b/drivers/infiniband/hw/hfi1/mad.c
@@ -588,7 +588,6 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
  
         pi->port_phys_conf = (ppd->port_type & 0xf);
  
-#if PI_LED_ENABLE_SUP
         pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
         pi->port_states.ledenable_offlinereason |=
                 ppd->is_sm_config_started << 5;
@@ -602,11 +601,6 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
         pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6;
         pi->port_states.ledenable_offlinereason |=
                 ppd->offline_disabled_reason;
-#else
-       pi->port_states.offline_reason = ppd->neighbor_normal << 4;
-       pi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
-       pi->port_states.offline_reason |= ppd->offline_disabled_reason;
-#endif /* PI_LED_ENABLE_SUP */
  
         pi->port_states.portphysstate_portstate =
                 (hfi1_ibphys_portstate(ppd) << 4) | state;
@@ -1752,17 +1746,11 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
         if (start_of_sm_config && (lstate == IB_PORT_INIT))
                 ppd->is_sm_config_started = 1;
  
-#if PI_LED_ENABLE_SUP
         psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
         psi->port_states.ledenable_offlinereason |=
                 ppd->is_sm_config_started << 5;
         psi->port_states.ledenable_offlinereason |=
                 ppd->offline_disabled_reason;
-#else
-       psi->port_states.offline_reason = ppd->neighbor_normal << 4;
-       psi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
-       psi->port_states.offline_reason |= ppd->offline_disabled_reason;
-#endif /* PI_LED_ENABLE_SUP */
  
         psi->port_states.portphysstate_portstate =
                 (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf);
@@ -2430,14 +2418,9 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
         rsp->port_rcv_remote_physical_errors =
                 cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
                                           CNTR_INVALID_VL));
-       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
-       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
-       if (tmp2 < tmp) {
-               /* overflow/wrapped */
-               rsp->local_link_integrity_errors = cpu_to_be64(~0);
-       } else {
-               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
-       }
+       rsp->local_link_integrity_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY,
+                                         CNTR_INVALID_VL));
         tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
         tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
                                    CNTR_INVALID_VL);
@@ -2499,6 +2482,9 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
                         cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
                                                   idx_from_vl(vl)));
  
+               rsp->vls[vfi].port_vl_xmit_discards =
+                       cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL,
+                                                  idx_from_vl(vl)));
                 vlinfo++;
                 vfi++;
         }
@@ -2529,9 +2515,8 @@ static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port,
         error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
                                                CNTR_INVALID_VL);
         /* local link integrity must be right-shifted by the lli resolution */
-       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
-       tmp += read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
-       error_counter_summary += (tmp >> res_lli);
+       error_counter_summary += (read_dev_cntr(dd, C_DC_RX_REPLAY,
+                                               CNTR_INVALID_VL) >> res_lli);
         /* link error recovery must be right-shifted by the ler resolution */
         tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
         tmp += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL);
@@ -2800,14 +2785,9 @@ static void pma_get_opa_port_ectrs(struct ib_device *ibdev,
         rsp->port_rcv_constraint_errors =
                 cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
                                            CNTR_INVALID_VL));
-       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
-       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
-       if (tmp2 < tmp) {
-               /* overflow/wrapped */
-               rsp->local_link_integrity_errors = cpu_to_be64(~0);
-       } else {
-               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
-       }
+       rsp->local_link_integrity_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY,
+                                         CNTR_INVALID_VL));
         rsp->excessive_buffer_overruns =
                 cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
  }
@@ -2883,14 +2863,17 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
         tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
  
         rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
-
+       rsp->port_rcv_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
         vlinfo = &rsp->vls[0];
         vfi = 0;
         vl_select_mask = be32_to_cpu(req->vl_select_mask);
         for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
                          8 * sizeof(req->vl_select_mask)) {
                 memset(vlinfo, 0, sizeof(*vlinfo));
-               /* vlinfo->vls[vfi].port_vl_xmit_discards ??? */
+               rsp->vls[vfi].port_vl_xmit_discards =
+                       cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL,
+                                                  idx_from_vl(vl)));
                 vlinfo += 1;
                 vfi++;
         }
@@ -3162,10 +3145,8 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
         if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS)
                 write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0);
  
-       if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) {
-               write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
+       if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS)
                 write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
-       }
  
         if (counter_select & CS_LINK_ERROR_RECOVERY) {
                 write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
@@ -3223,7 +3204,9 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
                 /* if (counter_select & CS_PORT_MARK_FECN)
                  *     write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0);
                  */
-               /* port_vl_xmit_discards ??? */
+               if (counter_select & C_SW_XMIT_DSCD_VL)
+                       write_port_cntr(ppd, C_SW_XMIT_DSCD_VL,
+                                       idx_from_vl(vl), 0);
         }
  
         if (resp_len)
@@ -3392,7 +3375,7 @@ static void apply_cc_state(struct hfi1_pportdata *ppd)
          */
         spin_lock(&ppd->cc_state_lock);
  
-       old_cc_state = get_cc_state(ppd);
+       old_cc_state = get_cc_state_protected(ppd);
         if (!old_cc_state) {
                 /* never active, or shutting down */
                 spin_unlock(&ppd->cc_state_lock);
@@ -3960,7 +3943,6 @@ void clear_linkup_counters(struct hfi1_devdata *dd)
         write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
         write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0);
         /* LocalLinkIntegrityErrors */
-       write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
         write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
         /* ExcessiveBufferOverruns */
         write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h

index 8b734aaae88adf48be52a1a2e3d17ca4f800b7ce..5aa3fd1be6538e6b37842c3cd734dedaf18f8dc0 100644 (file)
--- a/drivers/infiniband/hw/hfi1/mad.h
+++ b/drivers/infiniband/hw/hfi1/mad.h
@@ -48,15 +48,8 @@
  #define _HFI1_MAD_H
  
  #include <rdma/ib_pma.h>
-#define USE_PI_LED_ENABLE      1 /*
-                                  * use led enabled bit in struct
-                                  * opa_port_states, if available
-                                  */
  #include <rdma/opa_smi.h>
  #include <rdma/opa_port_info.h>
-#ifndef PI_LED_ENABLE_SUP
-#define PI_LED_ENABLE_SUP 0
-#endif
  #include "opa_compat.h"
  
  /*
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c

index b7a80aa1ae30601b3fdb67501ca7f5a4ba1d812e..7ad30898fc19de4cef022fb4eec70418ae6ee5ad 100644 (file)
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -53,19 +53,20 @@
  #include "trace.h"
  
  struct mmu_rb_handler {
-       struct list_head list;
         struct mmu_notifier mn;
-       struct rb_root *root;
+       struct rb_root root;
+       void *ops_arg;
         spinlock_t lock;        /* protect the RB tree */
         struct mmu_rb_ops *ops;
+       struct mm_struct *mm;
+       struct list_head lru_list;
+       struct work_struct del_work;
+       struct list_head del_list;
+       struct workqueue_struct *wq;
  };
  
-static LIST_HEAD(mmu_rb_handlers);
-static DEFINE_SPINLOCK(mmu_rb_lock); /* protect mmu_rb_handlers list */
-
  static unsigned long mmu_node_start(struct mmu_rb_node *);
  static unsigned long mmu_node_last(struct mmu_rb_node *);
-static struct mmu_rb_handler *find_mmu_handler(struct rb_root *);
  static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *,
                                      unsigned long);
  static inline void mmu_notifier_range_start(struct mmu_notifier *,
@@ -76,6 +77,9 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *,
                                         unsigned long, unsigned long);
  static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
                                            unsigned long, unsigned long);
+static void do_remove(struct mmu_rb_handler *handler,
+                     struct list_head *del_list);
+static void handle_remove(struct work_struct *work);
  
  static struct mmu_notifier_ops mn_opts = {
         .invalidate_page = mmu_notifier_page,
@@ -95,73 +99,79 @@ static unsigned long mmu_node_last(struct mmu_rb_node *node)
         return PAGE_ALIGN(node->addr + node->len) - 1;
  }
  
-int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops)
+int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
+                        struct mmu_rb_ops *ops,
+                        struct workqueue_struct *wq,
+                        struct mmu_rb_handler **handler)
  {
         struct mmu_rb_handler *handlr;
-
-       if (!ops->invalidate)
-               return -EINVAL;
+       int ret;
  
         handlr = kmalloc(sizeof(*handlr), GFP_KERNEL);
         if (!handlr)
                 return -ENOMEM;
  
-       handlr->root = root;
+       handlr->root = RB_ROOT;
         handlr->ops = ops;
+       handlr->ops_arg = ops_arg;
         INIT_HLIST_NODE(&handlr->mn.hlist);
         spin_lock_init(&handlr->lock);
         handlr->mn.ops = &mn_opts;
-       spin_lock(&mmu_rb_lock);
-       list_add_tail_rcu(&handlr->list, &mmu_rb_handlers);
-       spin_unlock(&mmu_rb_lock);
+       handlr->mm = mm;
+       INIT_WORK(&handlr->del_work, handle_remove);
+       INIT_LIST_HEAD(&handlr->del_list);
+       INIT_LIST_HEAD(&handlr->lru_list);
+       handlr->wq = wq;
+
+       ret = mmu_notifier_register(&handlr->mn, handlr->mm);
+       if (ret) {
+               kfree(handlr);
+               return ret;
+       }
  
-       return mmu_notifier_register(&handlr->mn, current->mm);
+       *handler = handlr;
+       return 0;
  }
  
-void hfi1_mmu_rb_unregister(struct rb_root *root)
+void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler)
  {
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
+       struct mmu_rb_node *rbnode;
+       struct rb_node *node;
         unsigned long flags;
-
-       if (!handler)
-               return;
+       struct list_head del_list;
  
         /* Unregister first so we don't get any more notifications. */
-       if (current->mm)
-               mmu_notifier_unregister(&handler->mn, current->mm);
+       mmu_notifier_unregister(&handler->mn, handler->mm);
  
-       spin_lock(&mmu_rb_lock);
-       list_del_rcu(&handler->list);
-       spin_unlock(&mmu_rb_lock);
-       synchronize_rcu();
+       /*
+        * Make sure the wq delete handler is finished running.  It will not
+        * be triggered once the mmu notifiers are unregistered above.
+        */
+       flush_work(&handler->del_work);
+
+       INIT_LIST_HEAD(&del_list);
  
         spin_lock_irqsave(&handler->lock, flags);
-       if (!RB_EMPTY_ROOT(root)) {
-               struct rb_node *node;
-               struct mmu_rb_node *rbnode;
-
-               while ((node = rb_first(root))) {
-                       rbnode = rb_entry(node, struct mmu_rb_node, node);
-                       rb_erase(node, root);
-                       if (handler->ops->remove)
-                               handler->ops->remove(root, rbnode, NULL);
-               }
+       while ((node = rb_first(&handler->root))) {
+               rbnode = rb_entry(node, struct mmu_rb_node, node);
+               rb_erase(node, &handler->root);
+               /* move from LRU list to delete list */
+               list_move(&rbnode->list, &del_list);
         }
         spin_unlock_irqrestore(&handler->lock, flags);
  
+       do_remove(handler, &del_list);
+
         kfree(handler);
  }
  
-int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
+int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
+                      struct mmu_rb_node *mnode)
  {
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
         struct mmu_rb_node *node;
         unsigned long flags;
         int ret = 0;
  
-       if (!handler)
-               return -EINVAL;
-
         spin_lock_irqsave(&handler->lock, flags);
         hfi1_cdbg(MMU, "Inserting node addr 0x%llx, len %u", mnode->addr,
                   mnode->len);
@@ -170,12 +180,13 @@ int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
                 ret = -EINVAL;
                 goto unlock;
         }
-       __mmu_int_rb_insert(mnode, root);
+       __mmu_int_rb_insert(mnode, &handler->root);
+       list_add(&mnode->list, &handler->lru_list);
  
-       if (handler->ops->insert) {
-               ret = handler->ops->insert(root, mnode);
-               if (ret)
-                       __mmu_int_rb_remove(mnode, root);
+       ret = handler->ops->insert(handler->ops_arg, mnode);
+       if (ret) {
+               __mmu_int_rb_remove(mnode, &handler->root);
+               list_del(&mnode->list); /* remove from LRU list */
         }
  unlock:
         spin_unlock_irqrestore(&handler->lock, flags);
@@ -191,10 +202,10 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
  
         hfi1_cdbg(MMU, "Searching for addr 0x%llx, len %u", addr, len);
         if (!handler->ops->filter) {
-               node = __mmu_int_rb_iter_first(handler->root, addr,
+               node = __mmu_int_rb_iter_first(&handler->root, addr,
                                                (addr + len) - 1);
         } else {
-               for (node = __mmu_int_rb_iter_first(handler->root, addr,
+               for (node = __mmu_int_rb_iter_first(&handler->root, addr,
                                                     (addr + len) - 1);
                      node;
                      node = __mmu_int_rb_iter_next(node, addr,
@@ -206,82 +217,72 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
         return node;
  }
  
-/* Caller must *not* hold handler lock. */
-static void __mmu_rb_remove(struct mmu_rb_handler *handler,
-                           struct mmu_rb_node *node, struct mm_struct *mm)
-{
-       unsigned long flags;
-
-       /* Validity of handler and node pointers has been checked by caller. */
-       hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr,
-                 node->len);
-       spin_lock_irqsave(&handler->lock, flags);
-       __mmu_int_rb_remove(node, handler->root);
-       spin_unlock_irqrestore(&handler->lock, flags);
-
-       if (handler->ops->remove)
-               handler->ops->remove(handler->root, node, mm);
-}
-
-struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr,
-                                      unsigned long len)
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler,
+                                       unsigned long addr, unsigned long len)
  {
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
         struct mmu_rb_node *node;
         unsigned long flags;
  
-       if (!handler)
-               return ERR_PTR(-EINVAL);
-
         spin_lock_irqsave(&handler->lock, flags);
         node = __mmu_rb_search(handler, addr, len);
+       if (node) {
+               __mmu_int_rb_remove(node, &handler->root);
+               list_del(&node->list); /* remove from LRU list */
+       }
         spin_unlock_irqrestore(&handler->lock, flags);
  
         return node;
  }
  
-struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root,
-                                       unsigned long addr, unsigned long len)
+void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg)
  {
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-       struct mmu_rb_node *node;
+       struct mmu_rb_node *rbnode, *ptr;
+       struct list_head del_list;
         unsigned long flags;
+       bool stop = false;
  
-       if (!handler)
-               return ERR_PTR(-EINVAL);
+       INIT_LIST_HEAD(&del_list);
  
         spin_lock_irqsave(&handler->lock, flags);
-       node = __mmu_rb_search(handler, addr, len);
-       if (node)
-               __mmu_int_rb_remove(node, handler->root);
+       list_for_each_entry_safe_reverse(rbnode, ptr, &handler->lru_list,
+                                        list) {
+               if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg,
+                                       &stop)) {
+                       __mmu_int_rb_remove(rbnode, &handler->root);
+                       /* move from LRU list to delete list */
+                       list_move(&rbnode->list, &del_list);
+               }
+               if (stop)
+                       break;
+       }
         spin_unlock_irqrestore(&handler->lock, flags);
  
-       return node;
+       while (!list_empty(&del_list)) {
+               rbnode = list_first_entry(&del_list, struct mmu_rb_node, list);
+               list_del(&rbnode->list);
+               handler->ops->remove(handler->ops_arg, rbnode);
+       }
  }
  
-void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node)
+/*
+ * It is up to the caller to ensure that this function does not race with the
+ * mmu invalidate notifier which may be calling the user's remove callback on
+ * 'node'.
+ */
+void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
+                       struct mmu_rb_node *node)
  {
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-
-       if (!handler || !node)
-               return;
-
-       __mmu_rb_remove(handler, node, NULL);
-}
+       unsigned long flags;
  
-static struct mmu_rb_handler *find_mmu_handler(struct rb_root *root)
-{
-       struct mmu_rb_handler *handler;
+       /* Validity of handler and node pointers has been checked by caller. */
+       hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr,
+                 node->len);
+       spin_lock_irqsave(&handler->lock, flags);
+       __mmu_int_rb_remove(node, &handler->root);
+       list_del(&node->list); /* remove from LRU list */
+       spin_unlock_irqrestore(&handler->lock, flags);
  
-       rcu_read_lock();
-       list_for_each_entry_rcu(handler, &mmu_rb_handlers, list) {
-               if (handler->root == root)
-                       goto unlock;
-       }
-       handler = NULL;
-unlock:
-       rcu_read_unlock();
-       return handler;
+       handler->ops->remove(handler->ops_arg, node);
  }
  
  static inline void mmu_notifier_page(struct mmu_notifier *mn,
@@ -304,9 +305,10 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
  {
         struct mmu_rb_handler *handler =
                 container_of(mn, struct mmu_rb_handler, mn);
-       struct rb_root *root = handler->root;
+       struct rb_root *root = &handler->root;
         struct mmu_rb_node *node, *ptr = NULL;
         unsigned long flags;
+       bool added = false;
  
         spin_lock_irqsave(&handler->lock, flags);
         for (node = __mmu_int_rb_iter_first(root, start, end - 1);
@@ -315,11 +317,53 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
                 ptr = __mmu_int_rb_iter_next(node, start, end - 1);
                 hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u",
                           node->addr, node->len);
-               if (handler->ops->invalidate(root, node)) {
+               if (handler->ops->invalidate(handler->ops_arg, node)) {
                         __mmu_int_rb_remove(node, root);
-                       if (handler->ops->remove)
-                               handler->ops->remove(root, node, mm);
+                       /* move from LRU list to delete list */
+                       list_move(&node->list, &handler->del_list);
+                       added = true;
                 }
         }
         spin_unlock_irqrestore(&handler->lock, flags);
+
+       if (added)
+               queue_work(handler->wq, &handler->del_work);
+}
+
+/*
+ * Call the remove function for the given handler and the list.  This
+ * is expected to be called with a delete list extracted from handler.
+ * The caller should not be holding the handler lock.
+ */
+static void do_remove(struct mmu_rb_handler *handler,
+                     struct list_head *del_list)
+{
+       struct mmu_rb_node *node;
+
+       while (!list_empty(del_list)) {
+               node = list_first_entry(del_list, struct mmu_rb_node, list);
+               list_del(&node->list);
+               handler->ops->remove(handler->ops_arg, node);
+       }
+}
+
+/*
+ * Work queue function to remove all nodes that have been queued up to
+ * be removed.  The key feature is that mm->mmap_sem is not being held
+ * and the remove callback can sleep while taking it, if needed.
+ */
+static void handle_remove(struct work_struct *work)
+{
+       struct mmu_rb_handler *handler = container_of(work,
+                                               struct mmu_rb_handler,
+                                               del_work);
+       struct list_head del_list;
+       unsigned long flags;
+
+       /* remove anything that is queued to get removed */
+       spin_lock_irqsave(&handler->lock, flags);
+       list_replace_init(&handler->del_list, &del_list);
+       spin_unlock_irqrestore(&handler->lock, flags);
+
+       do_remove(handler, &del_list);
  }
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h

index 7a57b9c49d271fdce65f5fa46a7f03c7f96fd89d..754f6ebf13fb1ac61d42dee6f2a31c726d98d42e 100644 (file)
--- a/drivers/infiniband/hw/hfi1/mmu_rb.h
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.h
@@ -54,23 +54,34 @@ struct mmu_rb_node {
         unsigned long len;
         unsigned long __last;
         struct rb_node node;
+       struct list_head list;
  };
  
+/*
+ * NOTE: filter, insert, invalidate, and evict must not sleep.  Only remove is
+ * allowed to sleep.
+ */
  struct mmu_rb_ops {
-       bool (*filter)(struct mmu_rb_node *, unsigned long, unsigned long);
-       int (*insert)(struct rb_root *, struct mmu_rb_node *);
-       void (*remove)(struct rb_root *, struct mmu_rb_node *,
-                      struct mm_struct *);
-       int (*invalidate)(struct rb_root *, struct mmu_rb_node *);
+       bool (*filter)(struct mmu_rb_node *node, unsigned long addr,
+                      unsigned long len);
+       int (*insert)(void *ops_arg, struct mmu_rb_node *mnode);
+       void (*remove)(void *ops_arg, struct mmu_rb_node *mnode);
+       int (*invalidate)(void *ops_arg, struct mmu_rb_node *node);
+       int (*evict)(void *ops_arg, struct mmu_rb_node *mnode,
+                    void *evict_arg, bool *stop);
  };
  
-int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops);
-void hfi1_mmu_rb_unregister(struct rb_root *);
-int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
-void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *);
-struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long,
-                                      unsigned long);
-struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long,
-                                       unsigned long);
+int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
+                        struct mmu_rb_ops *ops,
+                        struct workqueue_struct *wq,
+                        struct mmu_rb_handler **handler);
+void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler);
+int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
+                      struct mmu_rb_node *mnode);
+void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg);
+void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
+                       struct mmu_rb_node *mnode);
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler,
+                                       unsigned long addr, unsigned long len);
  
  #endif /* _HFI1_MMU_RB_H */
diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c

index 0bac21e6a658ca242b856910e06be7a91cbf7284..89c68da1c273297c71476fe0acbe37d93f7f97a3 100644 (file)
--- a/drivers/infiniband/hw/hfi1/pcie.c
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -679,6 +679,10 @@ static uint pcie_pset = UNSET_PSET;
  module_param(pcie_pset, uint, S_IRUGO);
  MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10");
  
+static uint pcie_ctle = 1; /* discrete on, integrated off */
+module_param(pcie_ctle, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_ctle, "PCIe static CTLE mode, bit 0 - discrete on/off, bit 1 - integrated on/off");
+
  /* equalization columns */
  #define PREC 0
  #define ATTN 1
@@ -716,6 +720,36 @@ static const u8 integrated_preliminary_eq[11][3] = {
         {  0x00,  0x1e,  0x0a },        /* p10 */
  };
  
+static const u8 discrete_ctle_tunings[11][4] = {
+       /* DC     LF     HF     BW */
+       {  0x48,  0x0b,  0x04,  0x04 }, /* p0 */
+       {  0x60,  0x05,  0x0f,  0x0a }, /* p1 */
+       {  0x50,  0x09,  0x06,  0x06 }, /* p2 */
+       {  0x68,  0x05,  0x0f,  0x0a }, /* p3 */
+       {  0x80,  0x05,  0x0f,  0x0a }, /* p4 */
+       {  0x70,  0x05,  0x0f,  0x0a }, /* p5 */
+       {  0x68,  0x05,  0x0f,  0x0a }, /* p6 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p7 */
+       {  0x48,  0x09,  0x06,  0x06 }, /* p8 */
+       {  0x60,  0x05,  0x0f,  0x0a }, /* p9 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p10 */
+};
+
+static const u8 integrated_ctle_tunings[11][4] = {
+       /* DC     LF     HF     BW */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p0 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p1 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p2 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p3 */
+       {  0x58,  0x0a,  0x05,  0x05 }, /* p4 */
+       {  0x48,  0x0a,  0x05,  0x05 }, /* p5 */
+       {  0x40,  0x0a,  0x05,  0x05 }, /* p6 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p7 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p8 */
+       {  0x38,  0x09,  0x06,  0x06 }, /* p9 */
+       {  0x38,  0x0e,  0x01,  0x01 }, /* p10 */
+};
+
  /* helper to format the value to write to hardware */
  #define eq_value(pre, curr, post) \
         ((((u32)(pre)) << \
@@ -951,11 +985,14 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd)
         u32 status, err;
         int ret;
         int do_retry, retry_count = 0;
+       int intnum = 0;
         uint default_pset;
         u16 target_vector, target_speed;
         u16 lnkctl2, vendor;
         u8 div;
         const u8 (*eq)[3];
+       const u8 (*ctle_tunings)[4];
+       uint static_ctle_mode;
         int return_error = 0;
  
         /* PCIe Gen3 is for the ASIC only */
@@ -1089,6 +1126,9 @@ retry:
                 div = 3;
                 eq = discrete_preliminary_eq;
                 default_pset = DEFAULT_DISCRETE_PSET;
+               ctle_tunings = discrete_ctle_tunings;
+               /* bit 0 - discrete on/off */
+               static_ctle_mode = pcie_ctle & 0x1;
         } else {
                 /* 400mV, FS=29, LF = 9 */
                 fs = 29;
@@ -1096,6 +1136,9 @@ retry:
                 div = 1;
                 eq = integrated_preliminary_eq;
                 default_pset = DEFAULT_MCP_PSET;
+               ctle_tunings = integrated_ctle_tunings;
+               /* bit 1 - integrated on/off */
+               static_ctle_mode = (pcie_ctle >> 1) & 0x1;
         }
         pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101,
                                (fs <<
@@ -1135,16 +1178,33 @@ retry:
          * step 5c: Program gasket interrupts
          */
         /* set the Rx Bit Rate to REFCLK ratio */
-       write_gasket_interrupt(dd, 0, 0x0006, 0x0050);
+       write_gasket_interrupt(dd, intnum++, 0x0006, 0x0050);
         /* disable pCal for PCIe Gen3 RX equalization */
-       write_gasket_interrupt(dd, 1, 0x0026, 0x5b01);
+       /* select adaptive or static CTLE */
+       write_gasket_interrupt(dd, intnum++, 0x0026,
+                              0x5b01 | (static_ctle_mode << 3));
         /*
          * Enable iCal for PCIe Gen3 RX equalization, and set which
          * evaluation of RX_EQ_EVAL will launch the iCal procedure.
          */
-       write_gasket_interrupt(dd, 2, 0x0026, 0x5202);
+       write_gasket_interrupt(dd, intnum++, 0x0026, 0x5202);
+
+       if (static_ctle_mode) {
+               /* apply static CTLE tunings */
+               u8 pcie_dc, pcie_lf, pcie_hf, pcie_bw;
+
+               pcie_dc = ctle_tunings[pcie_pset][0];
+               pcie_lf = ctle_tunings[pcie_pset][1];
+               pcie_hf = ctle_tunings[pcie_pset][2];
+               pcie_bw = ctle_tunings[pcie_pset][3];
+               write_gasket_interrupt(dd, intnum++, 0x0026, 0x0200 | pcie_dc);
+               write_gasket_interrupt(dd, intnum++, 0x0026, 0x0100 | pcie_lf);
+               write_gasket_interrupt(dd, intnum++, 0x0026, 0x0000 | pcie_hf);
+               write_gasket_interrupt(dd, intnum++, 0x0026, 0x5500 | pcie_bw);
+       }
+
         /* terminate list */
-       write_gasket_interrupt(dd, 3, 0x0000, 0x0000);
+       write_gasket_interrupt(dd, intnum++, 0x0000, 0x0000);
  
         /*
          * step 5d: program XMT margin
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c

index d4022450b73f7a9c6d8d528fbe44b3dd2fa0e00a..ac1bf4a73571ff72d1612644618c14403611d585 100644 (file)
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -1952,13 +1952,17 @@ int init_pervl_scs(struct hfi1_devdata *dd)
         dd->vld[15].sc = sc_alloc(dd, SC_VL15,
                                   dd->rcd[0]->rcvhdrqentsize, dd->node);
         if (!dd->vld[15].sc)
-               goto nomem;
+               return -ENOMEM;
+
         hfi1_init_ctxt(dd->vld[15].sc);
         dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
  
-       dd->kernel_send_context = kmalloc_node(dd->num_send_contexts *
+       dd->kernel_send_context = kzalloc_node(dd->num_send_contexts *
                                         sizeof(struct send_context *),
                                         GFP_KERNEL, dd->node);
+       if (!dd->kernel_send_context)
+               goto freesc15;
+
         dd->kernel_send_context[0] = dd->vld[15].sc;
  
         for (i = 0; i < num_vls; i++) {
@@ -2010,12 +2014,21 @@ int init_pervl_scs(struct hfi1_devdata *dd)
         if (pio_map_init(dd, ppd->port - 1, num_vls, NULL))
                 goto nomem;
         return 0;
+
  nomem:
-       sc_free(dd->vld[15].sc);
-       for (i = 0; i < num_vls; i++)
+       for (i = 0; i < num_vls; i++) {
                 sc_free(dd->vld[i].sc);
+               dd->vld[i].sc = NULL;
+       }
+
         for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++)
                 sc_free(dd->kernel_send_context[i + 1]);
+
+       kfree(dd->kernel_send_context);
+       dd->kernel_send_context = NULL;
+
+freesc15:
+       sc_free(dd->vld[15].sc);
         return -ENOMEM;
  }
  
diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c

index 03df9322f862984af68e82fca84295ad143ebed1..965c8aef0c604cce98e473bd0c6b9c11e1ad9a40 100644 (file)
--- a/drivers/infiniband/hw/hfi1/platform.c
+++ b/drivers/infiniband/hw/hfi1/platform.c
@@ -537,20 +537,6 @@ static void apply_tunings(
         u8 precur = 0, attn = 0, postcur = 0, external_device_config = 0;
         u8 *cache = ppd->qsfp_info.cache;
  
-       /* Enable external device config if channel is limiting active */
-       read_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
-                        GENERAL_CONFIG, &config_data);
-       config_data &= ~(0xff << ENABLE_EXT_DEV_CONFIG_SHIFT);
-       config_data |= ((u32)limiting_active << ENABLE_EXT_DEV_CONFIG_SHIFT);
-       ret = load_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
-                              GENERAL_CONFIG, config_data);
-       if (ret != HCMD_SUCCESS)
-               dd_dev_err(
-                       ppd->dd,
-                       "%s: Failed to set enable external device config\n",
-                       __func__);
-
-       config_data = 0; /* re-init  */
         /* Pass tuning method to 8051 */
         read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
                          &config_data);
@@ -638,9 +624,13 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
         if (ret)
                 return ret;
  
+       /*
+        * We'll change the QSFP memory contents from here on out, thus we set a
+        * flag here to remind ourselves to reset the QSFP module. This prevents
+        * reuse of stale settings established in our previous pass through.
+        */
         if (ppd->qsfp_info.reset_needed) {
                 reset_qsfp(ppd);
-               ppd->qsfp_info.reset_needed = 0;
                 refresh_qsfp_cache(ppd, &ppd->qsfp_info);
         } else {
                 ppd->qsfp_info.reset_needed = 1;
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c

index 1a942ffba4cb0880d53b40eaae6446ffa2b05024..a5aa3517e7d5c537d36786fd0d512cef6d60544c 100644 (file)
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -52,6 +52,7 @@
  #include <linux/seq_file.h>
  #include <rdma/rdma_vt.h>
  #include <rdma/rdmavt_qp.h>
+#include <rdma/ib_verbs.h>
  
  #include "hfi.h"
  #include "qp.h"
@@ -115,6 +116,66 @@ static const u16 credit_table[31] = {
         32768                   /* 1E */
  };
  
+const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
+[IB_WR_RDMA_WRITE] = {
+       .length = sizeof(struct ib_rdma_wr),
+       .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_RDMA_READ] = {
+       .length = sizeof(struct ib_rdma_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_ATOMIC,
+},
+
+[IB_WR_ATOMIC_CMP_AND_SWP] = {
+       .length = sizeof(struct ib_atomic_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
+},
+
+[IB_WR_ATOMIC_FETCH_AND_ADD] = {
+       .length = sizeof(struct ib_atomic_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
+},
+
+[IB_WR_RDMA_WRITE_WITH_IMM] = {
+       .length = sizeof(struct ib_rdma_wr),
+       .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_SEND] = {
+       .length = sizeof(struct ib_send_wr),
+       .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) |
+                      BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_SEND_WITH_IMM] = {
+       .length = sizeof(struct ib_send_wr),
+       .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) |
+                      BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_REG_MR] = {
+       .length = sizeof(struct ib_reg_wr),
+       .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_LOCAL,
+},
+
+[IB_WR_LOCAL_INV] = {
+       .length = sizeof(struct ib_send_wr),
+       .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_LOCAL,
+},
+
+[IB_WR_SEND_WITH_INV] = {
+       .length = sizeof(struct ib_send_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+},
+
+};
+
  static void flush_tx_list(struct rvt_qp *qp)
  {
         struct hfi1_qp_priv *priv = qp->priv;
@@ -745,8 +806,9 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp,
  
         priv->owner = qp;
  
-       priv->s_hdr = kzalloc_node(sizeof(*priv->s_hdr), gfp, rdi->dparms.node);
-       if (!priv->s_hdr) {
+       priv->s_ahg = kzalloc_node(sizeof(*priv->s_ahg), gfp,
+                                  rdi->dparms.node);
+       if (!priv->s_ahg) {
                 kfree(priv);
                 return ERR_PTR(-ENOMEM);
         }
@@ -759,7 +821,7 @@ void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
  {
         struct hfi1_qp_priv *priv = qp->priv;
  
-       kfree(priv->s_hdr);
+       kfree(priv->s_ahg);
         kfree(priv);
  }
  
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h

index e7bc8d6cf681cd4faca8896481b030b6a48c3aec..587d84d65bb8124656e0ad2b56686f6ac964978a 100644 (file)
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -54,6 +54,8 @@
  
  extern unsigned int hfi1_qp_table_size;
  
+extern const struct rvt_operation_params hfi1_post_parms[];
+
  /*
   * free_ahg - clear ahg from QP
   */
@@ -61,7 +63,7 @@ static inline void clear_ahg(struct rvt_qp *qp)
  {
         struct hfi1_qp_priv *priv = qp->priv;
  
-       priv->s_hdr->ahgcount = 0;
+       priv->s_ahg->ahgcount = 0;
         qp->s_flags &= ~(RVT_S_AHG_VALID | RVT_S_AHG_CLEAR);
         if (priv->s_sde && qp->s_ahgidx >= 0)
                 sdma_ahg_free(priv->s_sde, qp->s_ahgidx);
diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c

index 9fb561682c661b92e2a09beca2b53daf9f8c5b5e..a207717ade2aac0da5fbf34e8adb70a53efc2a42 100644 (file)
--- a/drivers/infiniband/hw/hfi1/qsfp.c
+++ b/drivers/infiniband/hw/hfi1/qsfp.c
@@ -50,46 +50,285 @@
  #include <linux/vmalloc.h>
  
  #include "hfi.h"
-#include "twsi.h"
+
+/* for the given bus number, return the CSR for reading an i2c line */
+static inline u32 i2c_in_csr(u32 bus_num)
+{
+       return bus_num ? ASIC_QSFP2_IN : ASIC_QSFP1_IN;
+}
+
+/* for the given bus number, return the CSR for writing an i2c line */
+static inline u32 i2c_oe_csr(u32 bus_num)
+{
+       return bus_num ? ASIC_QSFP2_OE : ASIC_QSFP1_OE;
+}
+
+static void hfi1_setsda(void *data, int state)
+{
+       struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data;
+       struct hfi1_devdata *dd = bus->controlling_dd;
+       u64 reg;
+       u32 target_oe;
+
+       target_oe = i2c_oe_csr(bus->num);
+       reg = read_csr(dd, target_oe);
+       /*
+        * The OE bit value is inverted and connected to the pin.  When
+        * OE is 0 the pin is left to be pulled up, when the OE is 1
+        * the pin is driven low.  This matches the "open drain" or "open
+        * collector" convention.
+        */
+       if (state)
+               reg &= ~QSFP_HFI0_I2CDAT;
+       else
+               reg |= QSFP_HFI0_I2CDAT;
+       write_csr(dd, target_oe, reg);
+       /* do a read to force the write into the chip */
+       (void)read_csr(dd, target_oe);
+}
+
+static void hfi1_setscl(void *data, int state)
+{
+       struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data;
+       struct hfi1_devdata *dd = bus->controlling_dd;
+       u64 reg;
+       u32 target_oe;
+
+       target_oe = i2c_oe_csr(bus->num);
+       reg = read_csr(dd, target_oe);
+       /*
+        * The OE bit value is inverted and connected to the pin.  When
+        * OE is 0 the pin is left to be pulled up, when the OE is 1
+        * the pin is driven low.  This matches the "open drain" or "open
+        * collector" convention.
+        */
+       if (state)
+               reg &= ~QSFP_HFI0_I2CCLK;
+       else
+               reg |= QSFP_HFI0_I2CCLK;
+       write_csr(dd, target_oe, reg);
+       /* do a read to force the write into the chip */
+       (void)read_csr(dd, target_oe);
+}
+
+static int hfi1_getsda(void *data)
+{
+       struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data;
+       u64 reg;
+       u32 target_in;
+
+       hfi1_setsda(data, 1);   /* clear OE so we do not pull line down */
+       udelay(2);              /* 1us pull up + 250ns hold */
+
+       target_in = i2c_in_csr(bus->num);
+       reg = read_csr(bus->controlling_dd, target_in);
+       return !!(reg & QSFP_HFI0_I2CDAT);
+}
+
+static int hfi1_getscl(void *data)
+{
+       struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data;
+       u64 reg;
+       u32 target_in;
+
+       hfi1_setscl(data, 1);   /* clear OE so we do not pull line down */
+       udelay(2);              /* 1us pull up + 250ns hold */
+
+       target_in = i2c_in_csr(bus->num);
+       reg = read_csr(bus->controlling_dd, target_in);
+       return !!(reg & QSFP_HFI0_I2CCLK);
+}
  
  /*
- * QSFP support for hfi driver, using "Two Wire Serial Interface" driver
- * in twsi.c
+ * Allocate and initialize the given i2c bus number.
+ * Returns NULL on failure.
   */
-#define I2C_MAX_RETRY 4
+static struct hfi1_i2c_bus *init_i2c_bus(struct hfi1_devdata *dd,
+                                        struct hfi1_asic_data *ad, int num)
+{
+       struct hfi1_i2c_bus *bus;
+       int ret;
+
+       bus = kzalloc(sizeof(*bus), GFP_KERNEL);
+       if (!bus)
+               return NULL;
+
+       bus->controlling_dd = dd;
+       bus->num = num; /* our bus number */
+
+       bus->algo.setsda = hfi1_setsda;
+       bus->algo.setscl = hfi1_setscl;
+       bus->algo.getsda = hfi1_getsda;
+       bus->algo.getscl = hfi1_getscl;
+       bus->algo.udelay = 5;
+       bus->algo.timeout = usecs_to_jiffies(50);
+       bus->algo.data = bus;
+
+       bus->adapter.owner = THIS_MODULE;
+       bus->adapter.algo_data = &bus->algo;
+       bus->adapter.dev.parent = &dd->pcidev->dev;
+       snprintf(bus->adapter.name, sizeof(bus->adapter.name),
+                "hfi1_i2c%d", num);
+
+       ret = i2c_bit_add_bus(&bus->adapter);
+       if (ret) {
+               dd_dev_info(dd, "%s: unable to add i2c bus %d, err %d\n",
+                           __func__, num, ret);
+               kfree(bus);
+               return NULL;
+       }
+
+       return bus;
+}
  
  /*
- * Raw i2c write.  No set-up or lock checking.
+ * Initialize i2c buses.
+ * Return 0 on success, -errno on error.
   */
-static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
-                      int offset, void *bp, int len)
+int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad)
  {
-       struct hfi1_devdata *dd = ppd->dd;
-       int ret, cnt;
-       u8 *buff = bp;
+       ad->i2c_bus0 = init_i2c_bus(dd, ad, 0);
+       ad->i2c_bus1 = init_i2c_bus(dd, ad, 1);
+       if (!ad->i2c_bus0 || !ad->i2c_bus1)
+               return -ENOMEM;
+       return 0;
+}
  
-       cnt = 0;
-       while (cnt < len) {
-               int wlen = len - cnt;
+static void clean_i2c_bus(struct hfi1_i2c_bus *bus)
+{
+       if (bus) {
+               i2c_del_adapter(&bus->adapter);
+               kfree(bus);
+       }
+}
  
-               ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset,
-                                      buff + cnt, wlen);
-               if (ret) {
-                       /* hfi1_twsi_blk_wr() 1 for error, else 0 */
-                       return -EIO;
-               }
-               offset += wlen;
-               cnt += wlen;
+void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad)
+{
+       clean_i2c_bus(ad->i2c_bus0);
+       ad->i2c_bus0 = NULL;
+       clean_i2c_bus(ad->i2c_bus1);
+       ad->i2c_bus1 = NULL;
+}
+
+static int i2c_bus_write(struct hfi1_devdata *dd, struct hfi1_i2c_bus *i2c,
+                        u8 slave_addr, int offset, int offset_size,
+                        u8 *data, u16 len)
+{
+       int ret;
+       int num_msgs;
+       u8 offset_bytes[2];
+       struct i2c_msg msgs[2];
+
+       switch (offset_size) {
+       case 0:
+               num_msgs = 1;
+               msgs[0].addr = slave_addr;
+               msgs[0].flags = 0;
+               msgs[0].len = len;
+               msgs[0].buf = data;
+               break;
+       case 2:
+               offset_bytes[1] = (offset >> 8) & 0xff;
+               /* fall through */
+       case 1:
+               num_msgs = 2;
+               offset_bytes[0] = offset & 0xff;
+
+               msgs[0].addr = slave_addr;
+               msgs[0].flags = 0;
+               msgs[0].len = offset_size;
+               msgs[0].buf = offset_bytes;
+
+               msgs[1].addr = slave_addr;
+               msgs[1].flags = I2C_M_NOSTART;
+               msgs[1].len = len;
+               msgs[1].buf = data;
+               break;
+       default:
+               return -EINVAL;
         }
  
-       /* Must wait min 20us between qsfp i2c transactions */
-       udelay(20);
+       i2c->controlling_dd = dd;
+       ret = i2c_transfer(&i2c->adapter, msgs, num_msgs);
+       if (ret != num_msgs) {
+               dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; write failed, ret %d\n",
+                          __func__, i2c->num, slave_addr, offset, len, ret);
+               return ret < 0 ? ret : -EIO;
+       }
+       return 0;
+}
+
+static int i2c_bus_read(struct hfi1_devdata *dd, struct hfi1_i2c_bus *bus,
+                       u8 slave_addr, int offset, int offset_size,
+                       u8 *data, u16 len)
+{
+       int ret;
+       int num_msgs;
+       u8 offset_bytes[2];
+       struct i2c_msg msgs[2];
+
+       switch (offset_size) {
+       case 0:
+               num_msgs = 1;
+               msgs[0].addr = slave_addr;
+               msgs[0].flags = I2C_M_RD;
+               msgs[0].len = len;
+               msgs[0].buf = data;
+               break;
+       case 2:
+               offset_bytes[1] = (offset >> 8) & 0xff;
+               /* fall through */
+       case 1:
+               num_msgs = 2;
+               offset_bytes[0] = offset & 0xff;
+
+               msgs[0].addr = slave_addr;
+               msgs[0].flags = 0;
+               msgs[0].len = offset_size;
+               msgs[0].buf = offset_bytes;
+
+               msgs[1].addr = slave_addr;
+               msgs[1].flags = I2C_M_RD;
+               msgs[1].len = len;
+               msgs[1].buf = data;
+               break;
+       default:
+               return -EINVAL;
+       }
  
-       return cnt;
+       bus->controlling_dd = dd;
+       ret = i2c_transfer(&bus->adapter, msgs, num_msgs);
+       if (ret != num_msgs) {
+               dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; read failed, ret %d\n",
+                          __func__, bus->num, slave_addr, offset, len, ret);
+               return ret < 0 ? ret : -EIO;
+       }
+       return 0;
+}
+
+/*
+ * Raw i2c write.  No set-up or lock checking.
+ *
+ * Return 0 on success, -errno on error.
+ */
+static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+                      int offset, void *bp, int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct hfi1_i2c_bus *bus;
+       u8 slave_addr;
+       int offset_size;
+
+       bus = target ? dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0;
+       slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */
+       offset_size = (i2c_addr >> 8) & 0x3;
+       return i2c_bus_write(dd, bus, slave_addr, offset, offset_size, bp, len);
  }
  
  /*
   * Caller must hold the i2c chain resource.
+ *
+ * Return number of bytes written, or -errno.
   */
  int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
               void *bp, int len)
@@ -99,63 +338,36 @@ int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
         if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
                 return -EACCES;
  
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "I2C chain %d write interface reset failed\n",
-                                target);
+       ret = __i2c_write(ppd, target, i2c_addr, offset, bp, len);
+       if (ret)
                 return ret;
-       }
  
-       return __i2c_write(ppd, target, i2c_addr, offset, bp, len);
+       return len;
  }
  
  /*
   * Raw i2c read.  No set-up or lock checking.
+ *
+ * Return 0 on success, -errno on error.
   */
  static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
                       int offset, void *bp, int len)
  {
         struct hfi1_devdata *dd = ppd->dd;
-       int ret, cnt, pass = 0;
-       int orig_offset = offset;
-
-       cnt = 0;
-       while (cnt < len) {
-               int rlen = len - cnt;
-
-               ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset,
-                                      bp + cnt, rlen);
-               /* Some QSFP's fail first try. Retry as experiment */
-               if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY)
-                       continue;
-               if (ret) {
-                       /* hfi1_twsi_blk_rd() 1 for error, else 0 */
-                       ret = -EIO;
-                       goto exit;
-               }
-               offset += rlen;
-               cnt += rlen;
-       }
-
-       ret = cnt;
-
-exit:
-       if (ret < 0) {
-               hfi1_dev_porterr(dd, ppd->port,
-                                "I2C chain %d read failed, addr 0x%x, offset 0x%x, len %d\n",
-                                target, i2c_addr, orig_offset, len);
-       }
-
-       /* Must wait min 20us between qsfp i2c transactions */
-       udelay(20);
-
-       return ret;
+       struct hfi1_i2c_bus *bus;
+       u8 slave_addr;
+       int offset_size;
+
+       bus = target ? dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0;
+       slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */
+       offset_size = (i2c_addr >> 8) & 0x3;
+       return i2c_bus_read(dd, bus, slave_addr, offset, offset_size, bp, len);
  }
  
  /*
   * Caller must hold the i2c chain resource.
+ *
+ * Return number of bytes read, or -errno.
   */
  int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
              void *bp, int len)
@@ -165,16 +377,11 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
         if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
                 return -EACCES;
  
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "I2C chain %d read interface reset failed\n",
-                                target);
+       ret = __i2c_read(ppd, target, i2c_addr, offset, bp, len);
+       if (ret)
                 return ret;
-       }
  
-       return __i2c_read(ppd, target, i2c_addr, offset, bp, len);
+       return len;
  }
  
  /*
@@ -182,6 +389,8 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
   * by writing @addr = ((256 * n) + m)
   *
   * Caller must hold the i2c chain resource.
+ *
+ * Return number of bytes written or -errno.
   */
  int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
                int len)
@@ -189,21 +398,12 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
         int count = 0;
         int offset;
         int nwrite;
-       int ret;
+       int ret = 0;
         u8 page;
  
         if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
                 return -EACCES;
  
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "QSFP chain %d write interface reset failed\n",
-                                target);
-               return ret;
-       }
-
         while (count < len) {
                 /*
                  * Set the qsfp page based on a zero-based address
@@ -213,11 +413,12 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
  
                 ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
                                   QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
-               if (ret != 1) {
+               /* QSFPs require a 5-10msec delay after write operations */
+               mdelay(5);
+               if (ret) {
                         hfi1_dev_porterr(ppd->dd, ppd->port,
                                          "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
                                          target, ret);
-                       ret = -EIO;
                         break;
                 }
  
@@ -229,11 +430,13 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
  
                 ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
                                   offset, bp + count, nwrite);
-               if (ret <= 0)   /* stop on error or nothing written */
+               /* QSFPs require a 5-10msec delay after write operations */
+               mdelay(5);
+               if (ret)        /* stop on error */
                         break;
  
-               count += ret;
-               addr += ret;
+               count += nwrite;
+               addr += nwrite;
         }
  
         if (ret < 0)
@@ -243,7 +446,7 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
  
  /*
   * Perform a stand-alone single QSFP write.  Acquire the resource, do the
- * read, then release the resource.
+ * write, then release the resource.
   */
  int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
                    int len)
@@ -266,6 +469,8 @@ int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
   * by reading @addr = ((256 * n) + m)
   *
   * Caller must hold the i2c chain resource.
+ *
+ * Return the number of bytes read or -errno.
   */
  int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
               int len)
@@ -273,21 +478,12 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
         int count = 0;
         int offset;
         int nread;
-       int ret;
+       int ret = 0;
         u8 page;
  
         if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
                 return -EACCES;
  
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "QSFP chain %d read interface reset failed\n",
-                                target);
-               return ret;
-       }
-
         while (count < len) {
                 /*
                  * Set the qsfp page based on a zero-based address
@@ -296,11 +492,12 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
                 page = (u8)(addr / QSFP_PAGESIZE);
                 ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
                                   QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
-               if (ret != 1) {
+               /* QSFPs require a 5-10msec delay after write operations */
+               mdelay(5);
+               if (ret) {
                         hfi1_dev_porterr(ppd->dd, ppd->port,
                                          "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
                                          target, ret);
-                       ret = -EIO;
                         break;
                 }
  
@@ -310,15 +507,13 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
                 if (((addr % QSFP_RW_BOUNDARY) + nread) > QSFP_RW_BOUNDARY)
                         nread = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY);
  
-               /* QSFPs require a 5-10msec delay after write operations */
-               mdelay(5);
                 ret = __i2c_read(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
                                  offset, bp + count, nread);
-               if (ret <= 0)   /* stop on error or nothing read */
+               if (ret)        /* stop on error */
                         break;
  
-               count += ret;
-               addr += ret;
+               count += nread;
+               addr += nread;
         }
  
         if (ret < 0)
diff --git a/drivers/infiniband/hw/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h

index dadc66c442b982130da735bcb7bcf1e81307590f..69275ebd9597322b599e97fce5804d17efa526f1 100644 (file)
--- a/drivers/infiniband/hw/hfi1/qsfp.h
+++ b/drivers/infiniband/hw/hfi1/qsfp.h
@@ -238,3 +238,6 @@ int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
                    int len);
  int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
                   int len);
+struct hfi1_asic_data;
+int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad);
+void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad);
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c

index 792f15eb8efeceeff89cd40e05e4180dace441b2..5da190e6011b1390f7861b2bd89a5a09e7976c14 100644 (file)
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -477,6 +477,37 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                                 qp->s_flags |= RVT_S_WAIT_FENCE;
                                 goto bail;
                         }
+                       /*
+                        * Local operations are processed immediately
+                        * after all prior requests have completed
+                        */
+                       if (wqe->wr.opcode == IB_WR_REG_MR ||
+                           wqe->wr.opcode == IB_WR_LOCAL_INV) {
+                               int local_ops = 0;
+                               int err = 0;
+
+                               if (qp->s_last != qp->s_cur)
+                                       goto bail;
+                               if (++qp->s_cur == qp->s_size)
+                                       qp->s_cur = 0;
+                               if (++qp->s_tail == qp->s_size)
+                                       qp->s_tail = 0;
+                               if (!(wqe->wr.send_flags &
+                                     RVT_SEND_COMPLETION_ONLY)) {
+                                       err = rvt_invalidate_rkey(
+                                               qp,
+                                               wqe->wr.ex.invalidate_rkey);
+                                       local_ops = 1;
+                               }
+                               hfi1_send_complete(qp, wqe,
+                                                  err ? IB_WC_LOC_PROT_ERR
+                                                      : IB_WC_SUCCESS);
+                               if (local_ops)
+                                       atomic_dec(&qp->local_ops_pending);
+                               qp->s_hdrwords = 0;
+                               goto done_free_tx;
+                       }
+
                         newreq = 1;
                         qp->s_psn = wqe->psn;
                 }
@@ -491,6 +522,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                 switch (wqe->wr.opcode) {
                 case IB_WR_SEND:
                 case IB_WR_SEND_WITH_IMM:
+               case IB_WR_SEND_WITH_INV:
                         /* If no credit, return. */
                         if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
                             cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
@@ -504,11 +536,17 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                         }
                         if (wqe->wr.opcode == IB_WR_SEND) {
                                 qp->s_state = OP(SEND_ONLY);
-                       } else {
+                       } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
                                 qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
                                 /* Immediate data comes after the BTH */
                                 ohdr->u.imm_data = wqe->wr.ex.imm_data;
                                 hwords += 1;
+                       } else {
+                               qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE);
+                               /* Invalidate rkey comes after the BTH */
+                               ohdr->u.ieth = cpu_to_be32(
+                                               wqe->wr.ex.invalidate_rkey);
+                               hwords += 1;
                         }
                         if (wqe->wr.send_flags & IB_SEND_SOLICITED)
                                 bth0 |= IB_BTH_SOLICITED;
@@ -671,11 +709,16 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                 }
                 if (wqe->wr.opcode == IB_WR_SEND) {
                         qp->s_state = OP(SEND_LAST);
-               } else {
+               } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
                         qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
                         /* Immediate data comes after the BTH */
                         ohdr->u.imm_data = wqe->wr.ex.imm_data;
                         hwords += 1;
+               } else {
+                       qp->s_state = OP(SEND_LAST_WITH_INVALIDATE);
+                       /* Invalidate rkey comes after the BTH */
+                       ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey);
+                       hwords += 1;
                 }
                 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
                         bth0 |= IB_BTH_SOLICITED;
@@ -1047,7 +1090,7 @@ void hfi1_rc_timeout(unsigned long arg)
                 ibp->rvp.n_rc_timeouts++;
                 qp->s_flags &= ~RVT_S_TIMER;
                 del_timer(&qp->s_timer);
-               trace_hfi1_rc_timeout(qp, qp->s_last_psn + 1);
+               trace_hfi1_timeout(qp, qp->s_last_psn + 1);
                 restart_rc(qp, qp->s_last_psn + 1, 1);
                 hfi1_schedule_send(qp);
         }
@@ -1171,7 +1214,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr)
          * If we were waiting for sends to complete before re-sending,
          * and they are now complete, restart sending.
          */
-       trace_hfi1_rc_sendcomplete(qp, psn);
+       trace_hfi1_sendcomplete(qp, psn);
         if (qp->s_flags & RVT_S_WAIT_PSN &&
             cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
                 qp->s_flags &= ~RVT_S_WAIT_PSN;
@@ -1567,7 +1610,7 @@ static void rc_rcv_resp(struct hfi1_ibport *ibp,
  
         spin_lock_irqsave(&qp->s_lock, flags);
  
-       trace_hfi1_rc_ack(qp, psn);
+       trace_hfi1_ack(qp, psn);
  
         /* Ignore invalid responses. */
         smp_read_barrier_depends(); /* see post_one_send */
@@ -1782,7 +1825,7 @@ static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data,
         u8 i, prev;
         int old_req;
  
-       trace_hfi1_rc_rcv_error(qp, psn);
+       trace_hfi1_rcv_error(qp, psn);
         if (diff > 0) {
                 /*
                  * Packet sequence error.
@@ -2086,7 +2129,6 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
         u32 tlen = packet->tlen;
         struct rvt_qp *qp = packet->qp;
         struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
         struct hfi1_other_headers *ohdr = packet->ohdr;
         u32 bth0, opcode;
         u32 hdrsize = packet->hlen;
@@ -2097,30 +2139,15 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
         int diff;
         struct ib_reth *reth;
         unsigned long flags;
-       u32 bth1;
         int ret, is_fecn = 0;
         int copy_last = 0;
+       u32 rkey;
  
         bth0 = be32_to_cpu(ohdr->bth[0]);
         if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
                 return;
  
-       bth1 = be32_to_cpu(ohdr->bth[1]);
-       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
-               if (bth1 & HFI1_BECN_SMASK) {
-                       u16 rlid = qp->remote_ah_attr.dlid;
-                       u32 lqpn, rqpn;
-
-                       lqpn = qp->ibqp.qp_num;
-                       rqpn = qp->remote_qpn;
-                       process_becn(
-                               ppd,
-                               qp->remote_ah_attr.sl,
-                               rlid, lqpn, rqpn,
-                               IB_CC_SVCTYPE_RC);
-               }
-               is_fecn = bth1 & HFI1_FECN_SMASK;
-       }
+       is_fecn = process_ecn(qp, packet, false);
  
         psn = be32_to_cpu(ohdr->bth[2]);
         opcode = (bth0 >> 24) & 0xff;
@@ -2154,7 +2181,8 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
         case OP(SEND_MIDDLE):
                 if (opcode == OP(SEND_MIDDLE) ||
                     opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+                   opcode == OP(SEND_LAST_WITH_INVALIDATE))
                         break;
                 goto nack_inv;
  
@@ -2170,6 +2198,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
                 if (opcode == OP(SEND_MIDDLE) ||
                     opcode == OP(SEND_LAST) ||
                     opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+                   opcode == OP(SEND_LAST_WITH_INVALIDATE) ||
                     opcode == OP(RDMA_WRITE_MIDDLE) ||
                     opcode == OP(RDMA_WRITE_LAST) ||
                     opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
@@ -2218,6 +2247,7 @@ send_middle:
  
         case OP(SEND_ONLY):
         case OP(SEND_ONLY_WITH_IMMEDIATE):
+       case OP(SEND_ONLY_WITH_INVALIDATE):
                 ret = hfi1_rvt_get_rwqe(qp, 0);
                 if (ret < 0)
                         goto nack_op_err;
@@ -2226,12 +2256,22 @@ send_middle:
                 qp->r_rcv_len = 0;
                 if (opcode == OP(SEND_ONLY))
                         goto no_immediate_data;
+               if (opcode == OP(SEND_ONLY_WITH_INVALIDATE))
+                       goto send_last_inv;
                 /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
         case OP(SEND_LAST_WITH_IMMEDIATE):
  send_last_imm:
                 wc.ex.imm_data = ohdr->u.imm_data;
                 wc.wc_flags = IB_WC_WITH_IMM;
                 goto send_last;
+       case OP(SEND_LAST_WITH_INVALIDATE):
+send_last_inv:
+               rkey = be32_to_cpu(ohdr->u.ieth);
+               if (rvt_invalidate_rkey(qp, rkey))
+                       goto no_immediate_data;
+               wc.ex.invalidate_rkey = rkey;
+               wc.wc_flags = IB_WC_WITH_INVALIDATE;
+               goto send_last;
         case OP(RDMA_WRITE_LAST):
                 copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
                 /* fall through */
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c

index a659aec3c3c6b95823650c7b6bb3ac800649967a..48d5094f98e259b92fade72c37799f43ecfcec4e 100644 (file)
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -372,6 +372,7 @@ static void ruc_loopback(struct rvt_qp *sqp)
         int ret;
         int copy_last = 0;
         u32 to;
+       int local_ops = 0;
  
         rcu_read_lock();
  
@@ -440,11 +441,31 @@ again:
         sqp->s_sge.num_sge = wqe->wr.num_sge;
         sqp->s_len = wqe->length;
         switch (wqe->wr.opcode) {
+       case IB_WR_REG_MR:
+               goto send_comp;
+
+       case IB_WR_LOCAL_INV:
+               if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
+                       if (rvt_invalidate_rkey(sqp,
+                                               wqe->wr.ex.invalidate_rkey))
+                               send_status = IB_WC_LOC_PROT_ERR;
+                       local_ops = 1;
+               }
+               goto send_comp;
+
+       case IB_WR_SEND_WITH_INV:
+               if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) {
+                       wc.wc_flags = IB_WC_WITH_INVALIDATE;
+                       wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey;
+               }
+               goto send;
+
         case IB_WR_SEND_WITH_IMM:
                 wc.wc_flags = IB_WC_WITH_IMM;
                 wc.ex.imm_data = wqe->wr.ex.imm_data;
                 /* FALLTHROUGH */
         case IB_WR_SEND:
+send:
                 ret = hfi1_rvt_get_rwqe(qp, 0);
                 if (ret < 0)
                         goto op_err;
@@ -583,6 +604,10 @@ send_comp:
  flush_send:
         sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
         hfi1_send_complete(sqp, wqe, send_status);
+       if (local_ops) {
+               atomic_dec(&sqp->local_ops_pending);
+               local_ops = 0;
+       }
         goto again;
  
  rnr_nak:
@@ -683,10 +708,10 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
         return sizeof(struct ib_grh) / sizeof(u32);
  }
  
-#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4)
+#define BTH2_OFFSET (offsetof(struct hfi1_sdma_header, hdr.u.oth.bth[2]) / 4)
  
  /**
- * build_ahg - create ahg in s_hdr
+ * build_ahg - create ahg in s_ahg
   * @qp: a pointer to QP
   * @npsn: the next PSN for the request/response
   *
@@ -708,19 +733,18 @@ static inline void build_ahg(struct rvt_qp *qp, u32 npsn)
                         qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde);
                 if (qp->s_ahgidx >= 0) {
                         qp->s_ahgpsn = npsn;
-                       priv->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
+                       priv->s_ahg->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
                         /* save to protect a change in another thread */
-                       priv->s_hdr->sde = priv->s_sde;
-                       priv->s_hdr->ahgidx = qp->s_ahgidx;
+                       priv->s_ahg->ahgidx = qp->s_ahgidx;
                         qp->s_flags |= RVT_S_AHG_VALID;
                 }
         } else {
                 /* subsequent middle after valid */
                 if (qp->s_ahgidx >= 0) {
-                       priv->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG;
-                       priv->s_hdr->ahgidx = qp->s_ahgidx;
-                       priv->s_hdr->ahgcount++;
-                       priv->s_hdr->ahgdesc[0] =
+                       priv->s_ahg->tx_flags |= SDMA_TXREQ_F_USE_AHG;
+                       priv->s_ahg->ahgidx = qp->s_ahgidx;
+                       priv->s_ahg->ahgcount++;
+                       priv->s_ahg->ahgdesc[0] =
                                 sdma_build_ahg_descriptor(
                                         (__force u16)cpu_to_be16((u16)npsn),
                                         BTH2_OFFSET,
@@ -728,8 +752,8 @@ static inline void build_ahg(struct rvt_qp *qp, u32 npsn)
                                         16);
                         if ((npsn & 0xffff0000) !=
                                         (qp->s_ahgpsn & 0xffff0000)) {
-                               priv->s_hdr->ahgcount++;
-                               priv->s_hdr->ahgdesc[1] =
+                               priv->s_ahg->ahgcount++;
+                               priv->s_ahg->ahgdesc[1] =
                                         sdma_build_ahg_descriptor(
                                                 (__force u16)cpu_to_be16(
                                                         (u16)(npsn >> 16)),
@@ -766,7 +790,7 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
         }
         lrh0 |= (priv->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
         /*
-        * reset s_hdr/AHG fields
+        * reset s_ahg/AHG fields
          *
          * This insures that the ahgentry/ahgcount
          * are at a non-AHG default to protect
@@ -776,10 +800,9 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
          * build_ahg() will modify as appropriate
          * to use the AHG feature.
          */
-       priv->s_hdr->tx_flags = 0;
-       priv->s_hdr->ahgcount = 0;
-       priv->s_hdr->ahgidx = 0;
-       priv->s_hdr->sde = NULL;
+       priv->s_ahg->tx_flags = 0;
+       priv->s_ahg->ahgcount = 0;
+       priv->s_ahg->ahgidx = 0;
         if (qp->s_mig_state == IB_MIG_MIGRATED)
                 bth0 |= IB_BTH_MIG_REQ;
         else
@@ -890,7 +913,7 @@ void hfi1_do_send(struct rvt_qp *qp)
                          */
                         if (hfi1_verbs_send(qp, &ps))
                                 return;
-                       /* Record that s_hdr is empty. */
+                       /* Record that s_ahg is empty. */
                         qp->s_hdrwords = 0;
                         /* allow other tasks to run */
                         if (unlikely(time_after(jiffies, timeout))) {
diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c

index 91fc2aed6aed93dbfa3b5d2b8a197ce4537a67a9..74c84c655f7e5c18f2737adfd582429cbd99c2ca 100644 (file)
--- a/drivers/infiniband/hw/hfi1/sysfs.c
+++ b/drivers/infiniband/hw/hfi1/sysfs.c
@@ -49,6 +49,7 @@
  #include "hfi.h"
  #include "mad.h"
  #include "trace.h"
+#include "affinity.h"
  
  /*
   * Start of per-port congestion control structures and support code
@@ -622,6 +623,27 @@ static ssize_t show_tempsense(struct device *device,
         return ret;
  }
  
+static ssize_t show_sdma_affinity(struct device *device,
+                                 struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+
+       return hfi1_get_sdma_affinity(dd, buf);
+}
+
+static ssize_t store_sdma_affinity(struct device *device,
+                                  struct device_attribute *attr,
+                                  const char *buf, size_t count)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+
+       return hfi1_set_sdma_affinity(dd, buf, count);
+}
+
  /*
   * end of per-unit (or driver, in some cases, but replicated
   * per unit) functions
@@ -636,6 +658,8 @@ static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
  static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
  static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
  static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
+static DEVICE_ATTR(sdma_affinity, S_IWUSR | S_IRUGO, show_sdma_affinity,
+                  store_sdma_affinity);
  
  static struct device_attribute *hfi1_attributes[] = {
         &dev_attr_hw_rev,
@@ -646,6 +670,7 @@ static struct device_attribute *hfi1_attributes[] = {
         &dev_attr_boardversion,
         &dev_attr_tempsense,
         &dev_attr_chip_reset,
+       &dev_attr_sdma_affinity,
  };
  
  int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h

index 28c1d083288632b9f90042d752732a8a157e3130..92dc88f013c9588b0c0da8d43eb189163c5aad8c 100644 (file)
--- a/drivers/infiniband/hw/hfi1/trace.h
+++ b/drivers/infiniband/hw/hfi1/trace.h
@@ -44,1329 +44,10 @@
   * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   *
   */
-#undef TRACE_SYSTEM_VAR
-#define TRACE_SYSTEM_VAR hfi1
-
-#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
-#define __HFI1_TRACE_H
-
-#include <linux/tracepoint.h>
-#include <linux/trace_seq.h>
-
-#include "hfi.h"
-#include "mad.h"
-#include "sdma.h"
-
-#define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
-#define DD_DEV_ASSIGN(dd)      __assign_str(dev, dev_name(&(dd)->pcidev->dev))
-
-#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
-#define show_packettype(etype)                  \
-__print_symbolic(etype,                         \
-       packettype_name(EXPECTED),              \
-       packettype_name(EAGER),                 \
-       packettype_name(IB),                    \
-       packettype_name(ERROR),                 \
-       packettype_name(BYPASS))
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_rx
-
-TRACE_EVENT(hfi1_rcvhdr,
-           TP_PROTO(struct hfi1_devdata *dd,
-                    u32 ctxt,
-                    u64 eflags,
-                    u32 etype,
-                    u32 hlen,
-                    u32 tlen,
-                    u32 updegr,
-                    u32 etail
-                    ),
-           TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(u64, eflags)
-                            __field(u32, ctxt)
-                            __field(u32, etype)
-                            __field(u32, hlen)
-                            __field(u32, tlen)
-                            __field(u32, updegr)
-                            __field(u32, etail)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->eflags = eflags;
-                          __entry->ctxt = ctxt;
-                          __entry->etype = etype;
-                          __entry->hlen = hlen;
-                          __entry->tlen = tlen;
-                          __entry->updegr = updegr;
-                          __entry->etail = etail;
-                          ),
-           TP_printk(
-                     "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->eflags,
-                     __entry->etype, show_packettype(__entry->etype),
-                     __entry->hlen,
-                     __entry->tlen,
-                     __entry->updegr,
-                     __entry->etail
-                     )
-);
-
-TRACE_EVENT(hfi1_receive_interrupt,
-           TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
-           TP_ARGS(dd, ctxt),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(u32, ctxt)
-                            __field(u8, slow_path)
-                            __field(u8, dma_rtail)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->ctxt = ctxt;
-                          if (dd->rcd[ctxt]->do_interrupt ==
-                              &handle_receive_interrupt) {
-                               __entry->slow_path = 1;
-                               __entry->dma_rtail = 0xFF;
-                          } else if (dd->rcd[ctxt]->do_interrupt ==
-                                     &handle_receive_interrupt_dma_rtail){
-                               __entry->dma_rtail = 1;
-                               __entry->slow_path = 0;
-                          } else if (dd->rcd[ctxt]->do_interrupt ==
-                                     &handle_receive_interrupt_nodma_rtail) {
-                               __entry->dma_rtail = 0;
-                               __entry->slow_path = 0;
-                          }
-                          ),
-           TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d",
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->slow_path,
-                     __entry->dma_rtail
-                     )
-);
-
-TRACE_EVENT(hfi1_exp_tid_reg,
-           TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr,
-                    u32 npages, unsigned long va, unsigned long pa,
-                    dma_addr_t dma),
-           TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __field(u32, rarr)
-                   __field(u32, npages)
-                   __field(unsigned long, va)
-                   __field(unsigned long, pa)
-                   __field(dma_addr_t, dma)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->rarr = rarr;
-                   __entry->npages = npages;
-                   __entry->va = va;
-                   __entry->pa = pa;
-                   __entry->dma = dma;
-                   ),
-           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->rarr,
-                     __entry->npages,
-                     __entry->pa,
-                     __entry->va,
-                     __entry->dma
-                   )
-       );
-
-TRACE_EVENT(hfi1_exp_tid_unreg,
-           TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, u32 npages,
-                    unsigned long va, unsigned long pa, dma_addr_t dma),
-           TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __field(u32, rarr)
-                   __field(u32, npages)
-                   __field(unsigned long, va)
-                   __field(unsigned long, pa)
-                   __field(dma_addr_t, dma)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->rarr = rarr;
-                   __entry->npages = npages;
-                   __entry->va = va;
-                   __entry->pa = pa;
-                   __entry->dma = dma;
-                   ),
-           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->rarr,
-                     __entry->npages,
-                     __entry->pa,
-                     __entry->va,
-                     __entry->dma
-                   )
-       );
-
-TRACE_EVENT(hfi1_exp_tid_inval,
-           TP_PROTO(unsigned ctxt, u16 subctxt, unsigned long va, u32 rarr,
-                    u32 npages, dma_addr_t dma),
-           TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __field(unsigned long, va)
-                   __field(u32, rarr)
-                   __field(u32, npages)
-                   __field(dma_addr_t, dma)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->va = va;
-                   __entry->rarr = rarr;
-                   __entry->npages = npages;
-                   __entry->dma = dma;
-                   ),
-           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->rarr,
-                     __entry->npages,
-                     __entry->va,
-                     __entry->dma
-                   )
-       );
-
-TRACE_EVENT(hfi1_mmu_invalidate,
-           TP_PROTO(unsigned ctxt, u16 subctxt, const char *type,
-                    unsigned long start, unsigned long end),
-           TP_ARGS(ctxt, subctxt, type, start, end),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __string(type, type)
-                   __field(unsigned long, start)
-                   __field(unsigned long, end)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __assign_str(type, type);
-                   __entry->start = start;
-                   __entry->end = end;
-                   ),
-           TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __get_str(type),
-                     __entry->start,
-                     __entry->end
-                   )
-       );
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_tx
-
-TRACE_EVENT(hfi1_piofree,
-           TP_PROTO(struct send_context *sc, int extra),
-           TP_ARGS(sc, extra),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
-                            __field(u32, sw_index)
-                            __field(u32, hw_context)
-                            __field(int, extra)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
-                          __entry->sw_index = sc->sw_index;
-                          __entry->hw_context = sc->hw_context;
-                          __entry->extra = extra;
-                          ),
-           TP_printk("[%s] ctxt %u(%u) extra %d",
-                     __get_str(dev),
-                     __entry->sw_index,
-                     __entry->hw_context,
-                     __entry->extra
-                     )
-);
-
-TRACE_EVENT(hfi1_wantpiointr,
-           TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl),
-           TP_ARGS(sc, needint, credit_ctrl),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
-                            __field(u32, sw_index)
-                            __field(u32, hw_context)
-                            __field(u32, needint)
-                            __field(u64, credit_ctrl)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
-                          __entry->sw_index = sc->sw_index;
-                          __entry->hw_context = sc->hw_context;
-                          __entry->needint = needint;
-                          __entry->credit_ctrl = credit_ctrl;
-                          ),
-           TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx",
-                     __get_str(dev),
-                     __entry->sw_index,
-                     __entry->hw_context,
-                     __entry->needint,
-                     (unsigned long long)__entry->credit_ctrl
-                      )
-);
-
-DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
-                   TP_PROTO(struct rvt_qp *qp, u32 flags),
-                   TP_ARGS(qp, flags),
-                   TP_STRUCT__entry(
-                           DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
-                           __field(u32, qpn)
-                           __field(u32, flags)
-                           __field(u32, s_flags)
-                           ),
-                   TP_fast_assign(
-                           DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
-                           __entry->flags = flags;
-                           __entry->qpn = qp->ibqp.qp_num;
-                           __entry->s_flags = qp->s_flags;
-                           ),
-                   TP_printk(
-                           "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
-                           __get_str(dev),
-                           __entry->qpn,
-                           __entry->flags,
-                           __entry->s_flags
-                           )
-);
-
-DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup,
-            TP_PROTO(struct rvt_qp *qp, u32 flags),
-            TP_ARGS(qp, flags));
-
-DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep,
-            TP_PROTO(struct rvt_qp *qp, u32 flags),
-            TP_ARGS(qp, flags));
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_ibhdrs
-
-u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr);
-const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs);
-
-#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
-
-const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1);
-
-#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1)
-
-#define lrh_name(lrh) { HFI1_##lrh, #lrh }
-#define show_lnh(lrh)                    \
-__print_symbolic(lrh,                    \
-       lrh_name(LRH_BTH),               \
-       lrh_name(LRH_GRH))
-
-#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode  }
-#define show_ib_opcode(opcode)                             \
-__print_symbolic(opcode,                                   \
-       ib_opcode_name(RC_SEND_FIRST),                     \
-       ib_opcode_name(RC_SEND_MIDDLE),                    \
-       ib_opcode_name(RC_SEND_LAST),                      \
-       ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE),       \
-       ib_opcode_name(RC_SEND_ONLY),                      \
-       ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE),       \
-       ib_opcode_name(RC_RDMA_WRITE_FIRST),               \
-       ib_opcode_name(RC_RDMA_WRITE_MIDDLE),              \
-       ib_opcode_name(RC_RDMA_WRITE_LAST),                \
-       ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
-       ib_opcode_name(RC_RDMA_WRITE_ONLY),                \
-       ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
-       ib_opcode_name(RC_RDMA_READ_REQUEST),              \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST),       \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE),      \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST),        \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY),        \
-       ib_opcode_name(RC_ACKNOWLEDGE),                    \
-       ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
-       ib_opcode_name(RC_COMPARE_SWAP),                   \
-       ib_opcode_name(RC_FETCH_ADD),                      \
-       ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE),      \
-       ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE),      \
-       ib_opcode_name(UC_SEND_FIRST),                     \
-       ib_opcode_name(UC_SEND_MIDDLE),                    \
-       ib_opcode_name(UC_SEND_LAST),                      \
-       ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE),       \
-       ib_opcode_name(UC_SEND_ONLY),                      \
-       ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE),       \
-       ib_opcode_name(UC_RDMA_WRITE_FIRST),               \
-       ib_opcode_name(UC_RDMA_WRITE_MIDDLE),              \
-       ib_opcode_name(UC_RDMA_WRITE_LAST),                \
-       ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
-       ib_opcode_name(UC_RDMA_WRITE_ONLY),                \
-       ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
-       ib_opcode_name(UD_SEND_ONLY),                      \
-       ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE),       \
-       ib_opcode_name(CNP))
-
-#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
-#define BTH_PRN \
-       "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
-       "f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
-#define EHDR_PRN "%s"
-
-DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
-                   TP_PROTO(struct hfi1_devdata *dd,
-                            struct hfi1_ib_header *hdr),
-                   TP_ARGS(dd, hdr),
-                   TP_STRUCT__entry(
-                           DD_DEV_ENTRY(dd)
-                           /* LRH */
-                           __field(u8, vl)
-                           __field(u8, lver)
-                           __field(u8, sl)
-                           __field(u8, lnh)
-                           __field(u16, dlid)
-                           __field(u16, len)
-                           __field(u16, slid)
-                           /* BTH */
-                           __field(u8, opcode)
-                           __field(u8, se)
-                           __field(u8, m)
-                           __field(u8, pad)
-                           __field(u8, tver)
-                           __field(u16, pkey)
-                           __field(u8, f)
-                           __field(u8, b)
-                           __field(u32, qpn)
-                           __field(u8, a)
-                           __field(u32, psn)
-                           /* extended headers */
-                           __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
-                           ),
-                   TP_fast_assign(
-                          struct hfi1_other_headers *ohdr;
-
-                          DD_DEV_ASSIGN(dd);
-                          /* LRH */
-                          __entry->vl =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 12);
-                          __entry->lver =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf;
-                          __entry->sl =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
-                          __entry->lnh =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
-                          __entry->dlid =
-                          be16_to_cpu(hdr->lrh[1]);
-                          /* allow for larger len */
-                          __entry->len =
-                          be16_to_cpu(hdr->lrh[2]);
-                          __entry->slid =
-                          be16_to_cpu(hdr->lrh[3]);
-                          /* BTH */
-                          if (__entry->lnh == HFI1_LRH_BTH)
-                               ohdr = &hdr->u.oth;
-                          else
-                               ohdr = &hdr->u.l.oth;
-                         __entry->opcode =
-                         (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
-                         __entry->se =
-                         (be32_to_cpu(ohdr->bth[0]) >> 23) & 1;
-                         __entry->m =
-                         (be32_to_cpu(ohdr->bth[0]) >> 22) & 1;
-                         __entry->pad =
-                         (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-                         __entry->tver =
-                         (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf;
-                         __entry->pkey =
-                         be32_to_cpu(ohdr->bth[0]) & 0xffff;
-                         __entry->f =
-                         (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) &
-                         HFI1_FECN_MASK;
-                         __entry->b =
-                         (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) &
-                         HFI1_BECN_MASK;
-                         __entry->qpn =
-                         be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-                         __entry->a =
-                         (be32_to_cpu(ohdr->bth[2]) >> 31) & 1;
-                         /* allow for larger PSN */
-                         __entry->psn =
-                         be32_to_cpu(ohdr->bth[2]) & 0x7fffffff;
-                         /* extended headers */
-                         memcpy(__get_dynamic_array(ehdrs), &ohdr->u,
-                                ibhdr_exhdr_len(hdr));
-                        ),
-                   TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
-                             __get_str(dev),
-                             /* LRH */
-                             __entry->vl,
-                             __entry->lver,
-                             __entry->sl,
-                             __entry->lnh, show_lnh(__entry->lnh),
-                             __entry->dlid,
-                             __entry->len,
-                             __entry->slid,
-                             /* BTH */
-                             __entry->opcode, show_ib_opcode(__entry->opcode),
-                             __entry->se,
-                             __entry->m,
-                             __entry->pad,
-                             __entry->tver,
-                             __entry->pkey,
-                             __entry->f,
-                             __entry->b,
-                             __entry->qpn,
-                             __entry->a,
-                             __entry->psn,
-                             /* extended headers */
-                             __parse_ib_ehdrs(
-                                       __entry->opcode,
-                                       (void *)__get_dynamic_array(ehdrs))
-                            )
-);
-
-DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-#define SNOOP_PRN \
-       "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
-       "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_snoop
-
-TRACE_EVENT(snoop_capture,
-           TP_PROTO(struct hfi1_devdata *dd,
-                    int hdr_len,
-                    struct hfi1_ib_header *hdr,
-                    int data_len,
-                    void *data),
-           TP_ARGS(dd, hdr_len, hdr, data_len, data),
-           TP_STRUCT__entry(
-               DD_DEV_ENTRY(dd)
-               __field(u16, slid)
-               __field(u16, dlid)
-               __field(u32, qpn)
-               __field(u8, opcode)
-               __field(u8, sl)
-               __field(u16, pkey)
-               __field(u32, hdr_len)
-               __field(u32, data_len)
-               __field(u8, lnh)
-               __dynamic_array(u8, raw_hdr, hdr_len)
-               __dynamic_array(u8, raw_pkt, data_len)
-               ),
-           TP_fast_assign(
-               struct hfi1_other_headers *ohdr;
-
-               __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
-               if (__entry->lnh == HFI1_LRH_BTH)
-                       ohdr = &hdr->u.oth;
-               else
-                       ohdr = &hdr->u.l.oth;
-               DD_DEV_ASSIGN(dd);
-               __entry->slid = be16_to_cpu(hdr->lrh[3]);
-               __entry->dlid = be16_to_cpu(hdr->lrh[1]);
-               __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-               __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
-               __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
-               __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff;
-               __entry->hdr_len = hdr_len;
-               __entry->data_len = data_len;
-               memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
-               memcpy(__get_dynamic_array(raw_pkt), data, data_len);
-               ),
-           TP_printk(
-               "[%s] " SNOOP_PRN,
-               __get_str(dev),
-               __entry->slid,
-               __entry->dlid,
-               __entry->qpn,
-               __entry->opcode,
-               show_ib_opcode(__entry->opcode),
-               __entry->sl,
-               __entry->pkey,
-               __entry->hdr_len,
-               __entry->data_len
-               )
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_ctxts
-
-#define UCTXT_FMT \
-       "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, "     \
-       "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx"
-TRACE_EVENT(hfi1_uctxtdata,
-           TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt),
-           TP_ARGS(dd, uctxt),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(unsigned, ctxt)
-                            __field(u32, credits)
-                            __field(u64, hw_free)
-                            __field(u64, piobase)
-                            __field(u16, rcvhdrq_cnt)
-                            __field(u64, rcvhdrq_phys)
-                            __field(u32, eager_cnt)
-                            __field(u64, rcvegr_phys)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->ctxt = uctxt->ctxt;
-                          __entry->credits = uctxt->sc->credits;
-                          __entry->hw_free = (u64)uctxt->sc->hw_free;
-                          __entry->piobase = (u64)uctxt->sc->base_addr;
-                          __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
-                          __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys;
-                          __entry->eager_cnt = uctxt->egrbufs.alloced;
-                          __entry->rcvegr_phys =
-                          uctxt->egrbufs.rcvtids[0].phys;
-                          ),
-           TP_printk("[%s] ctxt %u " UCTXT_FMT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->credits,
-                     __entry->hw_free,
-                     __entry->piobase,
-                     __entry->rcvhdrq_cnt,
-                     __entry->rcvhdrq_phys,
-                     __entry->eager_cnt,
-                     __entry->rcvegr_phys
-                     )
-);
-
-#define CINFO_FMT \
-       "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u"
-TRACE_EVENT(hfi1_ctxt_info,
-           TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt,
-                    struct hfi1_ctxt_info cinfo),
-           TP_ARGS(dd, ctxt, subctxt, cinfo),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(unsigned, ctxt)
-                            __field(unsigned, subctxt)
-                            __field(u16, egrtids)
-                            __field(u16, rcvhdrq_cnt)
-                            __field(u16, rcvhdrq_size)
-                            __field(u16, sdma_ring_size)
-                            __field(u32, rcvegr_size)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                           __entry->ctxt = ctxt;
-                           __entry->subctxt = subctxt;
-                           __entry->egrtids = cinfo.egrtids;
-                           __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
-                           __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
-                           __entry->sdma_ring_size = cinfo.sdma_ring_size;
-                           __entry->rcvegr_size = cinfo.rcvegr_size;
-                           ),
-           TP_printk("[%s] ctxt %u:%u " CINFO_FMT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->egrtids,
-                     __entry->rcvegr_size,
-                     __entry->rcvhdrq_cnt,
-                     __entry->rcvhdrq_size,
-                     __entry->sdma_ring_size
-                     )
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_sma
-
-#define BCT_FORMAT \
-       "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]"
-
-#define BCT(field) \
-       be16_to_cpu( \
-               ((struct buffer_control *)__get_dynamic_array(bct))->field \
-       )
-
-DECLARE_EVENT_CLASS(hfi1_bct_template,
-                   TP_PROTO(struct hfi1_devdata *dd,
-                            struct buffer_control *bc),
-                   TP_ARGS(dd, bc),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                                    __dynamic_array(u8, bct, sizeof(*bc))
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(dd);
-                                  memcpy(__get_dynamic_array(bct), bc,
-                                         sizeof(*bc));
-                                  ),
-                   TP_printk(BCT_FORMAT,
-                             BCT(overall_shared_limit),
-
-                             BCT(vl[0].dedicated),
-                             BCT(vl[0].shared),
-
-                             BCT(vl[1].dedicated),
-                             BCT(vl[1].shared),
-
-                             BCT(vl[2].dedicated),
-                             BCT(vl[2].shared),
-
-                             BCT(vl[3].dedicated),
-                             BCT(vl[3].shared),
-
-                             BCT(vl[4].dedicated),
-                             BCT(vl[4].shared),
-
-                             BCT(vl[5].dedicated),
-                             BCT(vl[5].shared),
-
-                             BCT(vl[6].dedicated),
-                             BCT(vl[6].shared),
-
-                             BCT(vl[7].dedicated),
-                             BCT(vl[7].shared),
-
-                             BCT(vl[15].dedicated),
-                             BCT(vl[15].shared)
-                             )
-);
-
-DEFINE_EVENT(hfi1_bct_template, bct_set,
-            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
-            TP_ARGS(dd, bc));
-
-DEFINE_EVENT(hfi1_bct_template, bct_get,
-            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
-            TP_ARGS(dd, bc));
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_sdma
-
-TRACE_EVENT(hfi1_sdma_descriptor,
-           TP_PROTO(struct sdma_engine *sde,
-                    u64 desc0,
-                    u64 desc1,
-                    u16 e,
-                    void *descp),
-       TP_ARGS(sde, desc0, desc1, e, descp),
-       TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                        __field(void *, descp)
-                        __field(u64, desc0)
-                        __field(u64, desc1)
-                        __field(u16, e)
-                        __field(u8, idx)
-                        ),
-       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                      __entry->desc0 = desc0;
-                      __entry->desc1 = desc1;
-                      __entry->idx = sde->this_idx;
-                      __entry->descp = descp;
-                      __entry->e = e;
-                      ),
-       TP_printk(
-                 "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u",
-                 __get_str(dev),
-                 __entry->idx,
-                 __parse_sdma_flags(__entry->desc0, __entry->desc1),
-                 (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) &
-                 SDMA_DESC0_PHY_ADDR_MASK,
-                 (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) &
-                      SDMA_DESC1_GENERATION_MASK),
-                 (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) &
-                       SDMA_DESC0_BYTE_COUNT_MASK),
-                 __entry->desc0,
-                 __entry->desc1,
-                 __entry->descp,
-                 __entry->e
-                 )
-);
-
-TRACE_EVENT(hfi1_sdma_engine_select,
-           TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx),
-           TP_ARGS(dd, sel, vl, idx),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(u32, sel)
-                            __field(u8, vl)
-                            __field(u8, idx)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->sel = sel;
-                          __entry->vl = vl;
-                          __entry->idx = idx;
-                          ),
-           TP_printk("[%s] selecting SDE %u sel 0x%x vl %u",
-                     __get_str(dev),
-                     __entry->idx,
-                     __entry->sel,
-                     __entry->vl
-                     )
-);
-
-DECLARE_EVENT_CLASS(hfi1_sdma_engine_class,
-                   TP_PROTO(struct sdma_engine *sde, u64 status),
-                   TP_ARGS(sde, status),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                                    __field(u64, status)
-                                    __field(u8, idx)
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                                  __entry->status = status;
-                                  __entry->idx = sde->this_idx;
-                                  ),
-                   TP_printk("[%s] SDE(%u) status %llx",
-                             __get_str(dev),
-                             __entry->idx,
-                             (unsigned long long)__entry->status
-                             )
-);
-
-DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt,
-            TP_PROTO(struct sdma_engine *sde, u64 status),
-            TP_ARGS(sde, status)
-);
-
-DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress,
-            TP_PROTO(struct sdma_engine *sde, u64 status),
-            TP_ARGS(sde, status)
-);
-
-DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad,
-                   TP_PROTO(struct sdma_engine *sde, int aidx),
-                   TP_ARGS(sde, aidx),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                                    __field(int, aidx)
-                                    __field(u8, idx)
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                                  __entry->idx = sde->this_idx;
-                                  __entry->aidx = aidx;
-                                  ),
-                   TP_printk("[%s] SDE(%u) aidx %d",
-                             __get_str(dev),
-                             __entry->idx,
-                             __entry->aidx
-                             )
-);
-
-DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate,
-            TP_PROTO(struct sdma_engine *sde, int aidx),
-            TP_ARGS(sde, aidx));
-
-DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate,
-            TP_PROTO(struct sdma_engine *sde, int aidx),
-            TP_ARGS(sde, aidx));
-
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-TRACE_EVENT(hfi1_sdma_progress,
-           TP_PROTO(struct sdma_engine *sde,
-                    u16 hwhead,
-                    u16 swhead,
-                    struct sdma_txreq *txp
-                    ),
-           TP_ARGS(sde, hwhead, swhead, txp),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                            __field(u64, sn)
-                            __field(u16, hwhead)
-                            __field(u16, swhead)
-                            __field(u16, txnext)
-                            __field(u16, tx_tail)
-                            __field(u16, tx_head)
-                            __field(u8, idx)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                          __entry->hwhead = hwhead;
-                          __entry->swhead = swhead;
-                          __entry->tx_tail = sde->tx_tail;
-                          __entry->tx_head = sde->tx_head;
-                          __entry->txnext = txp ? txp->next_descq_idx : ~0;
-                          __entry->idx = sde->this_idx;
-                          __entry->sn = txp ? txp->sn : ~0;
-                          ),
-           TP_printk(
-                     "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
-                     __get_str(dev),
-                     __entry->idx,
-                     __entry->sn,
-                     __entry->hwhead,
-                     __entry->swhead,
-                     __entry->txnext,
-                     __entry->tx_head,
-                     __entry->tx_tail
-                     )
-);
-#else
-TRACE_EVENT(hfi1_sdma_progress,
-           TP_PROTO(struct sdma_engine *sde,
-                    u16 hwhead, u16 swhead,
-                    struct sdma_txreq *txp
-           ),
-       TP_ARGS(sde, hwhead, swhead, txp),
-       TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                        __field(u16, hwhead)
-                        __field(u16, swhead)
-                        __field(u16, txnext)
-                        __field(u16, tx_tail)
-                        __field(u16, tx_head)
-                        __field(u8, idx)
-                        ),
-       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                      __entry->hwhead = hwhead;
-                      __entry->swhead = swhead;
-                      __entry->tx_tail = sde->tx_tail;
-                      __entry->tx_head = sde->tx_head;
-                      __entry->txnext = txp ? txp->next_descq_idx : ~0;
-                      __entry->idx = sde->this_idx;
-                      ),
-       TP_printk(
-                 "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
-                 __get_str(dev),
-                 __entry->idx,
-                 __entry->hwhead,
-                 __entry->swhead,
-                 __entry->txnext,
-                 __entry->tx_head,
-                 __entry->tx_tail
-                 )
-);
-#endif
-
-DECLARE_EVENT_CLASS(hfi1_sdma_sn,
-                   TP_PROTO(struct sdma_engine *sde, u64 sn),
-                   TP_ARGS(sde, sn),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                                    __field(u64, sn)
-                                    __field(u8, idx)
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                                  __entry->sn = sn;
-                                  __entry->idx = sde->this_idx;
-                                  ),
-                   TP_printk("[%s] SDE(%u) sn %llu",
-                             __get_str(dev),
-                             __entry->idx,
-                             __entry->sn
-                             )
-);
-
-DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn,
-            TP_PROTO(
-               struct sdma_engine *sde,
-               u64 sn
-            ),
-            TP_ARGS(sde, sn)
-);
-
-DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn,
-            TP_PROTO(struct sdma_engine *sde, u64 sn),
-            TP_ARGS(sde, sn)
-);
-
-#define USDMA_HDR_FORMAT \
-       "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x"
-
-TRACE_EVENT(hfi1_sdma_user_header,
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
-                    struct hfi1_pkt_header *hdr, u32 tidval),
-           TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd)
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u16, req)
-                   __field(__le32, pbc0)
-                   __field(__le32, pbc1)
-                   __field(__be32, lrh0)
-                   __field(__be32, lrh1)
-                   __field(__be32, bth0)
-                   __field(__be32, bth1)
-                   __field(__be32, bth2)
-                   __field(__le32, kdeth0)
-                   __field(__le32, kdeth1)
-                   __field(__le32, kdeth2)
-                   __field(__le32, kdeth3)
-                   __field(__le32, kdeth4)
-                   __field(__le32, kdeth5)
-                   __field(__le32, kdeth6)
-                   __field(__le32, kdeth7)
-                   __field(__le32, kdeth8)
-                   __field(u32, tidval)
-                   ),
-           TP_fast_assign(
-                   __le32 *pbc = (__le32 *)hdr->pbc;
-                   __be32 *lrh = (__be32 *)hdr->lrh;
-                   __be32 *bth = (__be32 *)hdr->bth;
-                   __le32 *kdeth = (__le32 *)&hdr->kdeth;
-
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->req = req;
-                   __entry->pbc0 = pbc[0];
-                   __entry->pbc1 = pbc[1];
-                   __entry->lrh0 = be32_to_cpu(lrh[0]);
-                   __entry->lrh1 = be32_to_cpu(lrh[1]);
-                   __entry->bth0 = be32_to_cpu(bth[0]);
-                   __entry->bth1 = be32_to_cpu(bth[1]);
-                   __entry->bth2 = be32_to_cpu(bth[2]);
-                   __entry->kdeth0 = kdeth[0];
-                   __entry->kdeth1 = kdeth[1];
-                   __entry->kdeth2 = kdeth[2];
-                   __entry->kdeth3 = kdeth[3];
-                   __entry->kdeth4 = kdeth[4];
-                   __entry->kdeth5 = kdeth[5];
-                   __entry->kdeth6 = kdeth[6];
-                   __entry->kdeth7 = kdeth[7];
-                   __entry->kdeth8 = kdeth[8];
-                   __entry->tidval = tidval;
-                   ),
-           TP_printk(USDMA_HDR_FORMAT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->req,
-                     __entry->pbc1,
-                     __entry->pbc0,
-                     __entry->lrh0,
-                     __entry->lrh1,
-                     __entry->bth0,
-                     __entry->bth1,
-                     __entry->bth2,
-                     __entry->kdeth0,
-                     __entry->kdeth1,
-                     __entry->kdeth2,
-                     __entry->kdeth3,
-                     __entry->kdeth4,
-                     __entry->kdeth5,
-                     __entry->kdeth6,
-                     __entry->kdeth7,
-                     __entry->kdeth8,
-                     __entry->tidval
-                   )
-       );
-
-#define SDMA_UREQ_FMT \
-       "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u"
-TRACE_EVENT(hfi1_sdma_user_reqinfo,
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
-           TP_ARGS(dd, ctxt, subctxt, i),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd);
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u8, ver_opcode)
-                   __field(u8, iovcnt)
-                   __field(u16, npkts)
-                   __field(u16, fragsize)
-                   __field(u16, comp_idx)
-                   ),
-           TP_fast_assign(
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->ver_opcode = i[0] & 0xff;
-                   __entry->iovcnt = (i[0] >> 8) & 0xff;
-                   __entry->npkts = i[1];
-                   __entry->fragsize = i[2];
-                   __entry->comp_idx = i[3];
-                   ),
-           TP_printk(SDMA_UREQ_FMT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->ver_opcode,
-                     __entry->iovcnt,
-                     __entry->npkts,
-                     __entry->fragsize,
-                     __entry->comp_idx
-                   )
-       );
-
-#define usdma_complete_name(st) { st, #st }
-#define show_usdma_complete_state(st)                  \
-       __print_symbolic(st,                            \
-                        usdma_complete_name(FREE),     \
-                        usdma_complete_name(QUEUED),   \
-                        usdma_complete_name(COMPLETE), \
-                        usdma_complete_name(ERROR))
-
-TRACE_EVENT(hfi1_sdma_user_completion,
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx,
-                    u8 state, int code),
-           TP_ARGS(dd, ctxt, subctxt, idx, state, code),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd)
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u16, idx)
-                   __field(u8, state)
-                   __field(int, code)
-                   ),
-           TP_fast_assign(
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->idx = idx;
-                   __entry->state = state;
-                   __entry->code = code;
-                   ),
-           TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)",
-                     __get_str(dev), __entry->ctxt, __entry->subctxt,
-                     __entry->idx, show_usdma_complete_state(__entry->state),
-                     __entry->code)
-       );
-
-const char *print_u32_array(struct trace_seq *, u32 *, int);
-#define __print_u32_hex(arr, len) print_u32_array(p, arr, len)
-
-TRACE_EVENT(hfi1_sdma_user_header_ahg,
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
-                    u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval),
-           TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd)
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u16, req)
-                   __field(u8, sde)
-                   __field(u8, idx)
-                   __field(int, len)
-                   __field(u32, tidval)
-                   __array(u32, ahg, 10)
-                   ),
-           TP_fast_assign(
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->req = req;
-                   __entry->sde = sde;
-                   __entry->idx = ahgidx;
-                   __entry->len = len;
-                   __entry->tidval = tidval;
-                   memcpy(__entry->ahg, ahg, len * sizeof(u32));
-                   ),
-           TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x",
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->req,
-                     __entry->sde,
-                     __entry->idx,
-                     __entry->len - 1,
-                     __print_u32_hex(__entry->ahg, __entry->len),
-                     __entry->tidval
-                   )
-       );
-
-TRACE_EVENT(hfi1_sdma_state,
-           TP_PROTO(struct sdma_engine *sde,
-                    const char *cstate,
-                    const char *nstate
-                    ),
-           TP_ARGS(sde, cstate, nstate),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                            __string(curstate, cstate)
-                            __string(newstate, nstate)
-                            ),
-       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                      __assign_str(curstate, cstate);
-                      __assign_str(newstate, nstate);
-                      ),
-       TP_printk("[%s] current state %s new state %s",
-                 __get_str(dev),
-                 __get_str(curstate),
-                 __get_str(newstate)
-                 )
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_rc
-
-DECLARE_EVENT_CLASS(hfi1_rc_template,
-                   TP_PROTO(struct rvt_qp *qp, u32 psn),
-                   TP_ARGS(qp, psn),
-                   TP_STRUCT__entry(
-                       DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
-                       __field(u32, qpn)
-                       __field(u32, s_flags)
-                       __field(u32, psn)
-                       __field(u32, s_psn)
-                       __field(u32, s_next_psn)
-                       __field(u32, s_sending_psn)
-                       __field(u32, s_sending_hpsn)
-                       __field(u32, r_psn)
-                       ),
-                   TP_fast_assign(
-                       DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
-                       __entry->qpn = qp->ibqp.qp_num;
-                       __entry->s_flags = qp->s_flags;
-                       __entry->psn = psn;
-                       __entry->s_psn = qp->s_psn;
-                       __entry->s_next_psn = qp->s_next_psn;
-                       __entry->s_sending_psn = qp->s_sending_psn;
-                       __entry->s_sending_hpsn = qp->s_sending_hpsn;
-                       __entry->r_psn = qp->r_psn;
-                       ),
-                   TP_printk(
-                       "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x",
-                       __get_str(dev),
-                       __entry->qpn,
-                       __entry->s_flags,
-                       __entry->psn,
-                       __entry->s_psn,
-                       __entry->s_next_psn,
-                       __entry->s_sending_psn,
-                       __entry->s_sending_hpsn,
-                       __entry->r_psn
-                       )
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_sendcomplete,
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_ack,
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_timeout,
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_rcv_error,
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_misc
-
-TRACE_EVENT(hfi1_interrupt,
-           TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
-                    int src),
-           TP_ARGS(dd, is_entry, src),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __array(char, buf, 64)
-                            __field(int, src)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd)
-                          is_entry->is_name(__entry->buf, 64,
-                                            src - is_entry->start);
-                          __entry->src = src;
-                          ),
-           TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf,
-                     __entry->src)
-);
-
-/*
- * Note:
- * This produces a REALLY ugly trace in the console output when the string is
- * too long.
- */
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_trace
-
-#define MAX_MSG_LEN 512
-
-DECLARE_EVENT_CLASS(hfi1_trace_template,
-                   TP_PROTO(const char *function, struct va_format *vaf),
-                   TP_ARGS(function, vaf),
-                   TP_STRUCT__entry(__string(function, function)
-                                    __dynamic_array(char, msg, MAX_MSG_LEN)
-                                    ),
-                   TP_fast_assign(__assign_str(function, function);
-                                  WARN_ON_ONCE(vsnprintf
-                                               (__get_dynamic_array(msg),
-                                                MAX_MSG_LEN, vaf->fmt,
-                                                *vaf->va) >=
-                                               MAX_MSG_LEN);
-                                  ),
-                   TP_printk("(%s) %s",
-                             __get_str(function),
-                             __get_str(msg))
-);
-
-/*
- * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an
- * actual function to work and can not be in a macro.
- */
-#define __hfi1_trace_def(lvl) \
-void __hfi1_trace_##lvl(const char *funct, char *fmt, ...);            \
-                                                                       \
-DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl,                         \
-       TP_PROTO(const char *function, struct va_format *vaf),          \
-       TP_ARGS(function, vaf))
-
-#define __hfi1_trace_fn(lvl) \
-void __hfi1_trace_##lvl(const char *func, char *fmt, ...)              \
-{                                                                      \
-       struct va_format vaf = {                                        \
-               .fmt = fmt,                                             \
-       };                                                              \
-       va_list args;                                                   \
-                                                                       \
-       va_start(args, fmt);                                            \
-       vaf.va = &args;                                                 \
-       trace_hfi1_ ##lvl(func, &vaf);                                  \
-       va_end(args);                                                   \
-       return;                                                         \
-}
-
-/*
- * To create a new trace level simply define it below and as a __hfi1_trace_fn
- * in trace.c. This will create all the hooks for calling
- * hfi1_cdbg(LVL, fmt, ...); as well as take care of all
- * the debugfs stuff.
- */
-__hfi1_trace_def(PKT);
-__hfi1_trace_def(PROC);
-__hfi1_trace_def(SDMA);
-__hfi1_trace_def(LINKVERB);
-__hfi1_trace_def(DEBUG);
-__hfi1_trace_def(SNOOP);
-__hfi1_trace_def(CNTR);
-__hfi1_trace_def(PIO);
-__hfi1_trace_def(DC8051);
-__hfi1_trace_def(FIRMWARE);
-__hfi1_trace_def(RCVCTRL);
-__hfi1_trace_def(TID);
-__hfi1_trace_def(MMU);
-__hfi1_trace_def(IOCTL);
-
-#define hfi1_cdbg(which, fmt, ...) \
-       __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
-
-#define hfi1_dbg(fmt, ...) \
-       hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__)
-
-/*
- * Define HFI1_EARLY_DBG at compile time or here to enable early trace
- * messages. Do not check in an enablement for this.
- */
-
-#ifdef HFI1_EARLY_DBG
-#define hfi1_dbg_early(fmt, ...) \
-       trace_printk(fmt, ##__VA_ARGS__)
-#else
-#define hfi1_dbg_early(fmt, ...)
-#endif
-
-#endif /* __HFI1_TRACE_H */
-
-#undef TRACE_INCLUDE_PATH
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE trace
-#include <trace/define_trace.h>
+#include "trace_dbg.h"
+#include "trace_misc.h"
+#include "trace_ctxts.h"
+#include "trace_ibhdrs.h"
+#include "trace_rc.h"
+#include "trace_rx.h"
+#include "trace_tx.h"
diff --git a/drivers/infiniband/hw/hfi1/trace_ctxts.h b/drivers/infiniband/hw/hfi1/trace_ctxts.h

new file mode 100644 (file)

index 0000000..31654bb
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_ctxts.h
@@ -0,0 +1,141 @@
+/*
+* Copyright(c) 2015, 2016 Intel Corporation.
+*
+* This file is provided under a dual BSD/GPLv2 license.  When using or
+* redistributing this file, you may do so under either license.
+*
+* GPL LICENSE SUMMARY
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of version 2 of the GNU General Public License as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* General Public License for more details.
+*
+* BSD LICENSE
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+*  - Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+*  - Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in
+*    the documentation and/or other materials provided with the
+*    distribution.
+*  - Neither the name of Intel Corporation nor the names of its
+*    contributors may be used to endorse or promote products derived
+*    from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_CTXTS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_CTXTS_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ctxts
+
+#define UCTXT_FMT \
+       "cred:%u, credaddr:0x%llx, piobase:0x%p, rcvhdr_cnt:%u, "       \
+       "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx"
+TRACE_EVENT(hfi1_uctxtdata,
+           TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt),
+           TP_ARGS(dd, uctxt),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(unsigned int, ctxt)
+                            __field(u32, credits)
+                            __field(u64, hw_free)
+                            __field(void __iomem *, piobase)
+                            __field(u16, rcvhdrq_cnt)
+                            __field(u64, rcvhdrq_phys)
+                            __field(u32, eager_cnt)
+                            __field(u64, rcvegr_phys)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                          __entry->ctxt = uctxt->ctxt;
+                          __entry->credits = uctxt->sc->credits;
+                          __entry->hw_free = le64_to_cpu(*uctxt->sc->hw_free);
+                          __entry->piobase = uctxt->sc->base_addr;
+                          __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+                          __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys;
+                          __entry->eager_cnt = uctxt->egrbufs.alloced;
+                          __entry->rcvegr_phys =
+                          uctxt->egrbufs.rcvtids[0].phys;
+                          ),
+           TP_printk("[%s] ctxt %u " UCTXT_FMT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->credits,
+                     __entry->hw_free,
+                     __entry->piobase,
+                     __entry->rcvhdrq_cnt,
+                     __entry->rcvhdrq_phys,
+                     __entry->eager_cnt,
+                     __entry->rcvegr_phys
+                     )
+);
+
+#define CINFO_FMT \
+       "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u"
+TRACE_EVENT(hfi1_ctxt_info,
+           TP_PROTO(struct hfi1_devdata *dd, unsigned int ctxt,
+                    unsigned int subctxt,
+                    struct hfi1_ctxt_info cinfo),
+           TP_ARGS(dd, ctxt, subctxt, cinfo),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(unsigned int, ctxt)
+                            __field(unsigned int, subctxt)
+                            __field(u16, egrtids)
+                            __field(u16, rcvhdrq_cnt)
+                            __field(u16, rcvhdrq_size)
+                            __field(u16, sdma_ring_size)
+                            __field(u32, rcvegr_size)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                           __entry->ctxt = ctxt;
+                           __entry->subctxt = subctxt;
+                           __entry->egrtids = cinfo.egrtids;
+                           __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
+                           __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
+                           __entry->sdma_ring_size = cinfo.sdma_ring_size;
+                           __entry->rcvegr_size = cinfo.rcvegr_size;
+                           ),
+           TP_printk("[%s] ctxt %u:%u " CINFO_FMT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->egrtids,
+                     __entry->rcvegr_size,
+                     __entry->rcvhdrq_cnt,
+                     __entry->rcvhdrq_size,
+                     __entry->sdma_ring_size
+                     )
+);
+
+#endif /* __HFI1_TRACE_CTXTS_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_ctxts
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h

new file mode 100644 (file)

index 0000000..0e7d929
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_dbg.h
@@ -0,0 +1,155 @@
+/*
+* Copyright(c) 2015, 2016 Intel Corporation.
+*
+* This file is provided under a dual BSD/GPLv2 license.  When using or
+* redistributing this file, you may do so under either license.
+*
+* GPL LICENSE SUMMARY
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of version 2 of the GNU General Public License as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* General Public License for more details.
+*
+* BSD LICENSE
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+*  - Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+*  - Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in
+*    the documentation and/or other materials provided with the
+*    distribution.
+*  - Neither the name of Intel Corporation nor the names of its
+*    contributors may be used to endorse or promote products derived
+*    from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_EXTRA_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_EXTRA_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+/*
+ * Note:
+ * This produces a REALLY ugly trace in the console output when the string is
+ * too long (WARN_ON_ONCE fires if it does not fit in MAX_MSG_LEN bytes).
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_dbg
+
+#define MAX_MSG_LEN 512 /* bytes reserved per event for the formatted msg */
+
+DECLARE_EVENT_CLASS(hfi1_trace_template,
+                   TP_PROTO(const char *function, struct va_format *vaf),
+                   TP_ARGS(function, vaf),
+                   TP_STRUCT__entry(__string(function, function)
+                                    __dynamic_array(char, msg, MAX_MSG_LEN)
+                                    ),
+                   TP_fast_assign(__assign_str(function, function);
+                                  WARN_ON_ONCE(vsnprintf
+                                               (__get_dynamic_array(msg),
+                                                MAX_MSG_LEN, vaf->fmt,
+                                                *vaf->va) >=
+                                               MAX_MSG_LEN);
+                                  ),
+                   TP_printk("(%s) %s",
+                             __get_str(function),
+                             __get_str(msg))
+);
+
+/*
+ * It would be nice to make __hfi1_trace_<lvl>() a macro, but the va_* handling
+ * needs an actual function, so __hfi1_trace_fn() generates one per level.
+ */
+#define __hfi1_trace_def(lvl) \
+void __hfi1_trace_##lvl(const char *funct, char *fmt, ...);            \
+                                                                       \
+DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl,                         \
+       TP_PROTO(const char *function, struct va_format *vaf),          \
+       TP_ARGS(function, vaf))
+
+#define __hfi1_trace_fn(lvl) \
+void __hfi1_trace_##lvl(const char *func, char *fmt, ...)              \
+{                                                                      \
+       struct va_format vaf = {                                        \
+               .fmt = fmt,                                             \
+       };                                                              \
+       va_list args;                                                   \
+                                                                       \
+       va_start(args, fmt);                                            \
+       vaf.va = &args;                                                 \
+       trace_hfi1_ ##lvl(func, &vaf);                                  \
+       va_end(args);                                                   \
+       return;                                                         \
+}
+
+/*
+ * To create a new trace level, add a __hfi1_trace_def(LVL) line below and a
+ * matching __hfi1_trace_fn(LVL) in trace.c. This will create all the hooks for
+ * calling hfi1_cdbg(LVL, fmt, ...), as well as take care of all
+ * the debugfs stuff.
+ */
+__hfi1_trace_def(PKT);
+__hfi1_trace_def(PROC);
+__hfi1_trace_def(SDMA);
+__hfi1_trace_def(LINKVERB);
+__hfi1_trace_def(DEBUG);
+__hfi1_trace_def(SNOOP);
+__hfi1_trace_def(CNTR);
+__hfi1_trace_def(PIO);
+__hfi1_trace_def(DC8051);
+__hfi1_trace_def(FIRMWARE);
+__hfi1_trace_def(RCVCTRL);
+__hfi1_trace_def(TID);
+__hfi1_trace_def(MMU);
+__hfi1_trace_def(IOCTL);
+
+#define hfi1_cdbg(which, fmt, ...) \
+       __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
+
+#define hfi1_dbg(fmt, ...) \
+       hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__)
+
+/*
+ * Define HFI1_EARLY_DBG at compile time or here to route hfi1_dbg_early()
+ * to trace_printk(); otherwise it is a no-op. Do not commit it enabled.
+ */
+
+#ifdef HFI1_EARLY_DBG
+#define hfi1_dbg_early(fmt, ...) \
+       trace_printk(fmt, ##__VA_ARGS__)
+#else
+#define hfi1_dbg_early(fmt, ...)
+#endif
+
+#endif /* __HFI1_TRACE_EXTRA_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_dbg
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h

new file mode 100644 (file)

index 0000000..c3e41ae
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#if !defined(__HFI1_TRACE_IBHDRS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_IBHDRS_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ibhdrs
+
+/*
+ * Helpers implemented outside this header.
+ * ibhdr_exhdr_len() supplies the byte count used by hfi1_ibhdr_template to
+ * size and fill its extended-header array; parse_everbs_hdrs() returns the
+ * string that is printed for EHDR_PRN.
+ */
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr);
+const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs);
+
+/*
+ * Relies on an identifier named 'p' being in scope where it is expanded
+ * (presumably the trace_seq available inside TP_printk -- confirm).
+ */
+#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
+
+/* Build a { HFI1_<name>, "<name>" } pair for __print_symbolic(). */
+#define lrh_name(lrh) { HFI1_##lrh, #lrh }
+/* Map an LNH value to the string "LRH_BTH" or "LRH_GRH". */
+#define show_lnh(lrh)                    \
+__print_symbolic(lrh,                    \
+       lrh_name(LRH_BTH),               \
+       lrh_name(LRH_GRH))
+
+/*
+ * Format fragments concatenated in hfi1_ibhdr_template's TP_printk; the
+ * conversions must stay in the same order as the arguments passed there.
+ */
+#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
+#define BTH_PRN \
+       "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
+       "f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
+#define EHDR_PRN "%s"
+
+/*
+ * hfi1_ibhdr_template - decode an IB packet header into a trace record.
+ *
+ * @dd:  device data; supplies the device name printed as "[%s]"
+ * @hdr: header to decode
+ *
+ * The LRH and BTH words are converted from big endian and split into their
+ * bit fields at trace time. The extended headers that follow the BTH are
+ * copied raw (ibhdr_exhdr_len(hdr) bytes) and only turned into text by
+ * parse_everbs_hdrs() when the record is printed.
+ */
+DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
+                   TP_PROTO(struct hfi1_devdata *dd,
+                            struct hfi1_ib_header *hdr),
+                   TP_ARGS(dd, hdr),
+                   TP_STRUCT__entry(
+                       DD_DEV_ENTRY(dd)
+                       /* LRH */
+                       __field(u8, vl)
+                       __field(u8, lver)
+                       __field(u8, sl)
+                       __field(u8, lnh)
+                       __field(u16, dlid)
+                       __field(u16, len)
+                       __field(u16, slid)
+                       /* BTH */
+                       __field(u8, opcode)
+                       __field(u8, se)
+                       __field(u8, m)
+                       __field(u8, pad)
+                       __field(u8, tver)
+                       __field(u16, pkey)
+                       __field(u8, f)
+                       __field(u8, b)
+                       __field(u32, qpn)
+                       __field(u8, a)
+                       __field(u32, psn)
+                       /* extended headers */
+                       __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
+                       ),
+                   TP_fast_assign(
+                       struct hfi1_other_headers *ohdr;
+                       /* LRH word 0 packs vl, lver, sl and lnh */
+                       u16 lrh0 = be16_to_cpu(hdr->lrh[0]);
+                       u32 bth0;
+                       u32 bth1;
+                       u32 bth2;
+
+                       DD_DEV_ASSIGN(dd);
+                       /* LRH */
+                       __entry->vl = (u8)(lrh0 >> 12);
+                       __entry->lver = (u8)(lrh0 >> 8) & 0xf;
+                       __entry->sl = (u8)(lrh0 >> 4) & 0xf;
+                       __entry->lnh = (u8)(lrh0 & 3);
+                       __entry->dlid = be16_to_cpu(hdr->lrh[1]);
+                       /* allow for larger len */
+                       __entry->len = be16_to_cpu(hdr->lrh[2]);
+                       __entry->slid = be16_to_cpu(hdr->lrh[3]);
+                       /* BTH: its location depends on the LNH value */
+                       ohdr = (__entry->lnh == HFI1_LRH_BTH) ?
+                               &hdr->u.oth : &hdr->u.l.oth;
+                       bth0 = be32_to_cpu(ohdr->bth[0]);
+                       bth1 = be32_to_cpu(ohdr->bth[1]);
+                       bth2 = be32_to_cpu(ohdr->bth[2]);
+                       __entry->opcode = (bth0 >> 24) & 0xff;
+                       __entry->se = (bth0 >> 23) & 1;
+                       __entry->m = (bth0 >> 22) & 1;
+                       __entry->pad = (bth0 >> 20) & 3;
+                       __entry->tver = (bth0 >> 16) & 0xf;
+                       __entry->pkey = bth0 & 0xffff;
+                       __entry->f = (bth1 >> HFI1_FECN_SHIFT) &
+                                    HFI1_FECN_MASK;
+                       __entry->b = (bth1 >> HFI1_BECN_SHIFT) &
+                                    HFI1_BECN_MASK;
+                       __entry->qpn = bth1 & RVT_QPN_MASK;
+                       __entry->a = (bth2 >> 31) & 1;
+                       /* allow for larger PSN */
+                       __entry->psn = bth2 & 0x7fffffff;
+                       /* extended headers */
+                       memcpy(__get_dynamic_array(ehdrs), &ohdr->u,
+                              ibhdr_exhdr_len(hdr));
+                       ),
+                   TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
+                             __get_str(dev),
+                             /* LRH */
+                             __entry->vl,
+                             __entry->lver,
+                             __entry->sl,
+                             __entry->lnh, show_lnh(__entry->lnh),
+                             __entry->dlid,
+                             __entry->len,
+                             __entry->slid,
+                             /* BTH */
+                             __entry->opcode, show_ib_opcode(__entry->opcode),
+                             __entry->se,
+                             __entry->m,
+                             __entry->pad,
+                             __entry->tver,
+                             __entry->pkey,
+                             __entry->f,
+                             __entry->b,
+                             __entry->qpn,
+                             __entry->a,
+                             __entry->psn,
+                             /* extended headers */
+                             __parse_ib_ehdrs(
+                                   __entry->opcode,
+                                   (void *)__get_dynamic_array(ehdrs))
+                       )
+);
+
+/*
+ * Events sharing hfi1_ibhdr_template. Each takes the device data and a
+ * pointer to the IB header to decode.
+ * NOTE(review): the names suggest one event for received headers and one
+ * each for the PIO, ACK and SDMA output paths -- confirm at the call sites.
+ */
+DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+#endif /* __HFI1_TRACE_IBHDRS_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_ibhdrs
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_misc.h b/drivers/infiniband/hw/hfi1/trace_misc.h

new file mode 100644 (file)

index 0000000..d308454
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_misc.h
@@ -0,0 +1,81 @@
+/*
+* Copyright(c) 2015, 2016 Intel Corporation.
+*
+* This file is provided under a dual BSD/GPLv2 license.  When using or
+* redistributing this file, you may do so under either license.
+*
+* GPL LICENSE SUMMARY
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of version 2 of the GNU General Public License as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* General Public License for more details.
+*
+* BSD LICENSE
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+*  - Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+*  - Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in
+*    the documentation and/or other materials provided with the
+*    distribution.
+*  - Neither the name of Intel Corporation nor the names of its
+*    contributors may be used to endorse or promote products derived
+*    from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_MISC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_MISC_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_misc
+
+/*
+ * hfi1_interrupt - record an interrupt source by number and by name.
+ *
+ * @dd:       device data; supplies the device name printed as "[%s]"
+ * @is_entry: table entry whose is_name() callback writes the source name
+ * @src:      interrupt source number
+ *
+ * is_name() is handed the source number relative to is_entry->start and
+ * writes into the fixed-size buffer stored in the trace record.
+ */
+TRACE_EVENT(hfi1_interrupt,
+           TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
+                    int src),
+           TP_ARGS(dd, is_entry, src),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __array(char, buf, 64)
+                            __field(int, src)
+                            ),
+           TP_fast_assign(
+                          DD_DEV_ASSIGN(dd);
+                          __entry->src = src;
+                          /* size comes from the __array() declaration */
+                          is_entry->is_name(__entry->buf,
+                                            sizeof(__entry->buf),
+                                            src - is_entry->start);
+                          ),
+           TP_printk("[%s] source: %s [%d]",
+                     __get_str(dev),
+                     __entry->buf,
+                     __entry->src)
+);
+
+#endif /* __HFI1_TRACE_MISC_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_misc
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h

new file mode 100644 (file)

index 0000000..5ea5005
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_rc.h
@@ -0,0 +1,123 @@
+/*
+* Copyright(c) 2015, 2016 Intel Corporation.
+*
+* This file is provided under a dual BSD/GPLv2 license.  When using or
+* redistributing this file, you may do so under either license.
+*
+* GPL LICENSE SUMMARY
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of version 2 of the GNU General Public License as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* General Public License for more details.
+*
+* BSD LICENSE
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+*  - Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+*  - Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in
+*    the documentation and/or other materials provided with the
+*    distribution.
+*  - Neither the name of Intel Corporation nor the names of its
+*    contributors may be used to endorse or promote products derived
+*    from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_RC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_RC_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rc
+
+/*
+ * hfi1_rc_template - snapshot of a QP's PSN state.
+ *
+ * @qp:  queue pair whose number, s_flags and PSN counters are recorded
+ * @psn: caller-supplied PSN, stored alongside the QP's own counters
+ *
+ * The device name is looked up from qp->ibqp.device via dd_from_ibdev().
+ * All values are copied as-is and printed in hex.
+ */
+DECLARE_EVENT_CLASS(hfi1_rc_template,
+                   TP_PROTO(struct rvt_qp *qp, u32 psn),
+                   TP_ARGS(qp, psn),
+                   TP_STRUCT__entry(
+                       DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+                       __field(u32, qpn)
+                       __field(u32, s_flags)
+                       __field(u32, psn)
+                       __field(u32, s_psn)
+                       __field(u32, s_next_psn)
+                       __field(u32, s_sending_psn)
+                       __field(u32, s_sending_hpsn)
+                       __field(u32, r_psn)
+                       ),
+                   TP_fast_assign(
+                       DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+                       __entry->qpn = qp->ibqp.qp_num;
+                       __entry->s_flags = qp->s_flags;
+                       __entry->psn = psn;
+                       __entry->s_psn = qp->s_psn;
+                       __entry->s_next_psn = qp->s_next_psn;
+                       __entry->s_sending_psn = qp->s_sending_psn;
+                       __entry->s_sending_hpsn = qp->s_sending_hpsn;
+                       __entry->r_psn = qp->r_psn;
+                       ),
+                   /* the "sending_hpsn" label prints the s_sending_hpsn field */
+                   TP_printk(
+                       "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x",
+                       __get_str(dev),
+                       __entry->qpn,
+                       __entry->s_flags,
+                       __entry->psn,
+                       __entry->s_psn,
+                       __entry->s_next_psn,
+                       __entry->s_sending_psn,
+                       __entry->s_sending_hpsn,
+                       __entry->r_psn
+                       )
+);
+
+/*
+ * Events sharing hfi1_rc_template; each records the same QP snapshot plus
+ * the PSN passed by the caller.
+ * NOTE(review): the names suggest send completion, ACK handling, timeout
+ * and receive-error paths -- confirm at the call sites.
+ */
+DEFINE_EVENT(hfi1_rc_template, hfi1_sendcomplete,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_ack,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_timeout,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_rcv_error,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+#endif /* __HFI1_TRACE_RC_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_rc
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h

new file mode 100644 (file)

index 0000000..9ba1f61
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_rx.h
@@ -0,0 +1,322 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#if !defined(__HFI1_TRACE_RX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_RX_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rx
+
+/*
+ * hfi1_rcvhdr - record the values passed for one received header.
+ *
+ * All arguments are stored unchanged. When printed, etype is shown both as
+ * a number and as the name returned by show_packettype().
+ * The 64-bit eflags is stored first in the record although ctxt comes first
+ * in the argument list and in the output (presumably for alignment).
+ */
+TRACE_EVENT(hfi1_rcvhdr,
+           TP_PROTO(struct hfi1_devdata *dd,
+                    u32 ctxt,
+                    u64 eflags,
+                    u32 etype,
+                    u32 hlen,
+                    u32 tlen,
+                    u32 updegr,
+                    u32 etail
+                   ),
+           TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(u64, eflags)
+                            __field(u32, ctxt)
+                            __field(u32, etype)
+                            __field(u32, hlen)
+                            __field(u32, tlen)
+                            __field(u32, updegr)
+                            __field(u32, etail)
+                            ),
+            TP_fast_assign(DD_DEV_ASSIGN(dd);
+                           __entry->eflags = eflags;
+                           __entry->ctxt = ctxt;
+                           __entry->etype = etype;
+                           __entry->hlen = hlen;
+                           __entry->tlen = tlen;
+                           __entry->updegr = updegr;
+                           __entry->etail = etail;
+                           ),
+            TP_printk(
+               "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
+               __get_str(dev),
+               __entry->ctxt,
+               __entry->eflags,
+               __entry->etype, show_packettype(__entry->etype),
+               __entry->hlen,
+               __entry->tlen,
+               __entry->updegr,
+               __entry->etail
+               )
+);
+
+TRACE_EVENT(hfi1_receive_interrupt,
+           TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
+           TP_ARGS(dd, ctxt),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(u32, ctxt)
+                            __field(u8, slow_path)
+                            __field(u8, dma_rtail)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                       __entry->ctxt = ctxt;
+                       if (dd->rcd[ctxt]->do_interrupt ==
+                           &handle_receive_interrupt) {
+                               __entry->slow_path = 1;
+                               __entry->dma_rtail = 0xFF;
+                       } else if (dd->rcd[ctxt]->do_interrupt ==
+                                       &handle_receive_interrupt_dma_rtail){
+                               __entry->dma_rtail = 1;
+                               __entry->slow_path = 0;
+                       } else if (dd->rcd[ctxt]->do_interrupt ==
+                                       &handle_receive_interrupt_nodma_rtail) {
+                               __entry->dma_rtail = 0;
+                               __entry->slow_path = 0;
+                       }
+                       ),
+           TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d",
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->slow_path,
+                     __entry->dma_rtail
+                     )
+);
+
+/*
+ * hfi1_exp_tid_reg_unreg - shared layout for the expected-TID register and
+ * unregister events.
+ *
+ * @ctxt, @subctxt: printed as "[ctxt:subctxt]"
+ * @rarr:           printed as "entry:"
+ * @npages:         page count
+ * @va, @pa, @dma:  addresses; note the value printed after '@' is @pa,
+ *                  followed by "va:" and "dma:"
+ *
+ * hfi1_exp_tid_reg and hfi1_exp_tid_unreg record and print exactly the
+ * same data, so they are defined from one class instead of two duplicate
+ * TRACE_EVENT() bodies.
+ */
+DECLARE_EVENT_CLASS(hfi1_exp_tid_reg_unreg,
+           TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr,
+                    u32 npages, unsigned long va, unsigned long pa,
+                    dma_addr_t dma),
+           TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+           TP_STRUCT__entry(
+                            __field(unsigned int, ctxt)
+                            __field(u16, subctxt)
+                            __field(u32, rarr)
+                            __field(u32, npages)
+                            __field(unsigned long, va)
+                            __field(unsigned long, pa)
+                            __field(dma_addr_t, dma)
+                            ),
+           TP_fast_assign(
+                          __entry->ctxt = ctxt;
+                          __entry->subctxt = subctxt;
+                          __entry->rarr = rarr;
+                          __entry->npages = npages;
+                          __entry->va = va;
+                          __entry->pa = pa;
+                          __entry->dma = dma;
+                          ),
+           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->rarr,
+                     __entry->npages,
+                     __entry->pa,
+                     __entry->va,
+                     __entry->dma
+                     )
+       );
+
+DEFINE_EVENT(hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg,
+            TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr,
+                     u32 npages, unsigned long va, unsigned long pa,
+                     dma_addr_t dma),
+            TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)
+       );
+
+DEFINE_EVENT(hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg,
+            TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr,
+                     u32 npages, unsigned long va, unsigned long pa,
+                     dma_addr_t dma),
+            TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)
+       );
+
+/*
+ * hfi1_exp_tid_inval - record the values passed for a TID invalidation.
+ *
+ * Stores ctxt, subctxt, va, rarr, npages and dma unchanged. The output is
+ * "[ctxt:subctxt] entry:<rarr>, <npages> pages @ <va> dma: <dma>"; unlike
+ * the reg/unreg events there is no physical address.
+ */
+TRACE_EVENT(hfi1_exp_tid_inval,
+           TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,
+                    u32 npages, dma_addr_t dma),
+           TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
+           TP_STRUCT__entry(
+                            __field(unsigned int, ctxt)
+                            __field(u16, subctxt)
+                            __field(unsigned long, va)
+                            __field(u32, rarr)
+                            __field(u32, npages)
+                            __field(dma_addr_t, dma)
+                            ),
+           TP_fast_assign(
+                          __entry->ctxt = ctxt;
+                          __entry->subctxt = subctxt;
+                          __entry->va = va;
+                          __entry->rarr = rarr;
+                          __entry->npages = npages;
+                          __entry->dma = dma;
+                         ),
+           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->rarr,
+                     __entry->npages,
+                     __entry->va,
+                     __entry->dma
+                     )
+           );
+
+/*
+ * hfi1_mmu_invalidate - record an MMU invalidation over [start, end].
+ *
+ * @type is a caller-supplied string that is copied into the record (via
+ * __string/__assign_str) and printed in parentheses; @start and @end are
+ * stored unchanged and printed in hex.
+ */
+TRACE_EVENT(hfi1_mmu_invalidate,
+           TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type,
+                    unsigned long start, unsigned long end),
+           TP_ARGS(ctxt, subctxt, type, start, end),
+           TP_STRUCT__entry(
+                            __field(unsigned int, ctxt)
+                            __field(u16, subctxt)
+                            __string(type, type)
+                            __field(unsigned long, start)
+                            __field(unsigned long, end)
+                            ),
+           TP_fast_assign(
+                       __entry->ctxt = ctxt;
+                       __entry->subctxt = subctxt;
+                       __assign_str(type, type);
+                       __entry->start = start;
+                       __entry->end = end;
+           ),
+           TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __get_str(type),
+                     __entry->start,
+                     __entry->end
+                     )
+           );
+
+/* Output format for snoop_capture, appended after the "[%s] " device tag. */
+#define SNOOP_PRN \
+       "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
+       "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
+
+/*
+ * snoop_capture - record a packet's key header fields plus raw copies of
+ * its header and data.
+ *
+ * @dd:       device data; supplies the device name printed as "[%s]"
+ * @hdr_len:  number of bytes copied from @hdr into raw_hdr
+ * @hdr:      IB header; LRH/BTH fields are decoded from it
+ * @data_len: number of bytes copied from @data into raw_pkt
+ * @data:     packet data
+ *
+ * Only the decoded fields and the two lengths appear in the printed line;
+ * raw_hdr and raw_pkt are stored in the record but not printed.
+ */
+TRACE_EVENT(snoop_capture,
+           TP_PROTO(struct hfi1_devdata *dd,
+                    int hdr_len,
+                    struct hfi1_ib_header *hdr,
+                    int data_len,
+                    void *data),
+           TP_ARGS(dd, hdr_len, hdr, data_len, data),
+           TP_STRUCT__entry(
+                            DD_DEV_ENTRY(dd)
+                            __field(u16, slid)
+                            __field(u16, dlid)
+                            __field(u32, qpn)
+                            __field(u8, opcode)
+                            __field(u8, sl)
+                            __field(u16, pkey)
+                            __field(u32, hdr_len)
+                            __field(u32, data_len)
+                            __field(u8, lnh)
+                            __dynamic_array(u8, raw_hdr, hdr_len)
+                            __dynamic_array(u8, raw_pkt, data_len)
+                            ),
+           TP_fast_assign(
+               struct hfi1_other_headers *ohdr;
+               u16 lrh0 = be16_to_cpu(hdr->lrh[0]);
+               u32 bth0;
+
+               DD_DEV_ASSIGN(dd);
+               /* LRH fields */
+               __entry->lnh = (u8)(lrh0 & 3);
+               __entry->sl = (u8)(lrh0 >> 4) & 0xf;
+               __entry->dlid = be16_to_cpu(hdr->lrh[1]);
+               __entry->slid = be16_to_cpu(hdr->lrh[3]);
+               /* BTH fields: the BTH location depends on the LNH value */
+               ohdr = (__entry->lnh == HFI1_LRH_BTH) ?
+                       &hdr->u.oth : &hdr->u.l.oth;
+               bth0 = be32_to_cpu(ohdr->bth[0]);
+               __entry->opcode = (bth0 >> 24) & 0xff;
+               __entry->pkey = bth0 & 0xffff;
+               __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+               /* raw copies */
+               __entry->hdr_len = hdr_len;
+               __entry->data_len = data_len;
+               memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
+               memcpy(__get_dynamic_array(raw_pkt), data, data_len);
+               ),
+           TP_printk(
+               "[%s] " SNOOP_PRN,
+               __get_str(dev),
+               __entry->slid,
+               __entry->dlid,
+               __entry->qpn,
+               __entry->opcode,
+               show_ib_opcode(__entry->opcode),
+               __entry->sl,
+               __entry->pkey,
+               __entry->hdr_len,
+               __entry->data_len
+               )
+);
+
+#endif /* __HFI1_TRACE_RX_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_rx
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h

new file mode 100644 (file)

index 0000000..415d6be
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_tx.h
@@ -0,0 +1,642 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#if !defined(__HFI1_TRACE_TX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_TX_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "sdma.h"
+
+const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1);
+/* The wrapper below passes a variable named p that must be in scope where it expands. */
+#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1)
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_tx
+
+TRACE_EVENT(hfi1_piofree, /* records a send context's indices plus a caller-supplied int */
+           TP_PROTO(struct send_context *sc, int extra),
+           TP_ARGS(sc, extra),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
+           __field(u32, sw_index) /* from sc->sw_index */
+           __field(u32, hw_context) /* from sc->hw_context */
+           __field(int, extra) /* stored exactly as passed in */
+           ),
+           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
+           __entry->sw_index = sc->sw_index;
+           __entry->hw_context = sc->hw_context;
+           __entry->extra = extra;
+           ),
+           TP_printk("[%s] ctxt %u(%u) extra %d", /* ctxt sw_index(hw_context) */
+                     __get_str(dev),
+                     __entry->sw_index,
+                     __entry->hw_context,
+                     __entry->extra
+           )
+);
+
+TRACE_EVENT(hfi1_wantpiointr, /* records needint and credit_ctrl for a send context */
+           TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl),
+           TP_ARGS(sc, needint, credit_ctrl),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
+                       __field(u32, sw_index) /* from sc->sw_index */
+                       __field(u32, hw_context) /* from sc->hw_context */
+                       __field(u32, needint)
+                       __field(u64, credit_ctrl)
+                       ),
+           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
+                       __entry->sw_index = sc->sw_index;
+                       __entry->hw_context = sc->hw_context;
+                       __entry->needint = needint;
+                       __entry->credit_ctrl = credit_ctrl;
+                       ),
+           TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx", /* needint is shown as "on" */
+                     __get_str(dev),
+                     __entry->sw_index,
+                     __entry->hw_context,
+                     __entry->needint,
+                     (unsigned long long)__entry->credit_ctrl
+                     )
+);
+
+DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template, /* record layout shared by hfi1_qpwakeup and hfi1_qpsleep */
+                   TP_PROTO(struct rvt_qp *qp, u32 flags),
+                   TP_ARGS(qp, flags),
+                   TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+                   __field(u32, qpn) /* from qp->ibqp.qp_num */
+                   __field(u32, flags) /* the flags argument, stored as passed */
+                   __field(u32, s_flags) /* from qp->s_flags */
+                   ),
+                   TP_fast_assign(
+                   DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); /* explicit ';' so this does not rely on the macro's expansion */
+                   __entry->flags = flags;
+                   __entry->qpn = qp->ibqp.qp_num;
+                   __entry->s_flags = qp->s_flags;
+                   ),
+                   TP_printk(
+                   "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
+                   __get_str(dev),
+                   __entry->qpn,
+                   __entry->flags,
+                   __entry->s_flags
+                   )
+);
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup, /* instance of the class above */
+            TP_PROTO(struct rvt_qp *qp, u32 flags),
+            TP_ARGS(qp, flags));
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep, /* instance of the class above */
+            TP_PROTO(struct rvt_qp *qp, u32 flags),
+            TP_ARGS(qp, flags));
+
+TRACE_EVENT(hfi1_sdma_descriptor, /* stores both raw descriptor qwords; sub-fields are decoded at print time */
+           TP_PROTO(struct sdma_engine *sde,
+                    u64 desc0,
+                    u64 desc1,
+                    u16 e,
+                    void *descp),
+                    TP_ARGS(sde, desc0, desc1, e, descp),
+                    TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                    __field(void *, descp)
+                    __field(u64, desc0)
+                    __field(u64, desc1)
+                    __field(u16, e)
+                    __field(u8, idx) /* from sde->this_idx */
+                    ),
+                    TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                    __entry->desc0 = desc0;
+                    __entry->desc1 = desc1;
+                    __entry->idx = sde->this_idx;
+                    __entry->descp = descp;
+                    __entry->e = e;
+                    ),
+           TP_printk(
+           "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u",
+           __get_str(dev),
+           __entry->idx,
+           __parse_sdma_flags(__entry->desc0, __entry->desc1), /* flags: string built by parse_sdma_flags() */
+           (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) & /* addr: taken from desc0 */
+           SDMA_DESC0_PHY_ADDR_MASK,
+           (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) & /* gen: taken from desc1 */
+           SDMA_DESC1_GENERATION_MASK),
+           (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) & /* len: taken from desc0 */
+           SDMA_DESC0_BYTE_COUNT_MASK),
+           __entry->desc0,
+           __entry->desc1,
+           __entry->descp, /* "to %p,%u" prints descp, then e */
+           __entry->e
+           )
+);
+
+TRACE_EVENT(hfi1_sdma_engine_select, /* records the sel, vl and idx values passed by the caller */
+           TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx),
+           TP_ARGS(dd, sel, vl, idx),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+           __field(u32, sel)
+           __field(u8, vl)
+           __field(u8, idx)
+           ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+           __entry->sel = sel;
+           __entry->vl = vl;
+           __entry->idx = idx;
+           ),
+           TP_printk("[%s] selecting SDE %u sel 0x%x vl %u", /* printed order: idx, sel, vl */
+                     __get_str(dev),
+                     __entry->idx,
+                     __entry->sel,
+                     __entry->vl
+                     )
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_engine_class, /* engine index plus a 64-bit status value */
+                   TP_PROTO(struct sdma_engine *sde, u64 status),
+                   TP_ARGS(sde, status),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                   __field(u64, status)
+                   __field(u8, idx) /* from sde->this_idx */
+                   ),
+                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                   __entry->status = status;
+                   __entry->idx = sde->this_idx;
+                   ),
+                   TP_printk("[%s] SDE(%u) status %llx", /* status is printed in hex without a 0x prefix */
+                             __get_str(dev),
+                             __entry->idx,
+                             (unsigned long long)__entry->status
+                             )
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt, /* instance of the class above */
+            TP_PROTO(struct sdma_engine *sde, u64 status),
+            TP_ARGS(sde, status)
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress, /* instance of the class above */
+            TP_PROTO(struct sdma_engine *sde, u64 status),
+            TP_ARGS(sde, status)
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad, /* engine index plus the aidx value passed by the caller */
+                   TP_PROTO(struct sdma_engine *sde, int aidx),
+                   TP_ARGS(sde, aidx),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                   __field(int, aidx)
+                   __field(u8, idx) /* from sde->this_idx */
+                   ),
+                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                   __entry->idx = sde->this_idx;
+                   __entry->aidx = aidx;
+                   ),
+                   TP_printk("[%s] SDE(%u) aidx %d",
+                             __get_str(dev),
+                             __entry->idx,
+                             __entry->aidx
+                             )
+);
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate, /* instance of the class above */
+            TP_PROTO(struct sdma_engine *sde, int aidx),
+            TP_ARGS(sde, aidx));
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate, /* instance of the class above */
+            TP_PROTO(struct sdma_engine *sde, int aidx),
+            TP_ARGS(sde, aidx));
+
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER /* this variant additionally records txp->sn */
+TRACE_EVENT(hfi1_sdma_progress,
+           TP_PROTO(struct sdma_engine *sde,
+                    u16 hwhead,
+                    u16 swhead,
+                    struct sdma_txreq *txp
+                    ),
+           TP_ARGS(sde, hwhead, swhead, txp),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+           __field(u64, sn)
+           __field(u16, hwhead)
+           __field(u16, swhead)
+           __field(u16, txnext)
+           __field(u16, tx_tail)
+           __field(u16, tx_head)
+           __field(u8, idx)
+           ),
+           TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+           __entry->hwhead = hwhead;
+           __entry->swhead = swhead;
+           __entry->tx_tail = sde->tx_tail;
+           __entry->tx_head = sde->tx_head;
+           __entry->txnext = txp ? txp->next_descq_idx : ~0; /* txp may be NULL; then all bits are set */
+           __entry->idx = sde->this_idx;
+           __entry->sn = txp ? txp->sn : ~0; /* same all-ones placeholder when txp is NULL */
+           ),
+           TP_printk(
+           "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+           __get_str(dev),
+           __entry->idx,
+           __entry->sn,
+           __entry->hwhead,
+           __entry->swhead,
+           __entry->txnext, /* printed under the label next_descq_idx */
+           __entry->tx_head,
+           __entry->tx_tail
+           )
+);
+#else /* !CONFIG_HFI1_DEBUG_SDMA_ORDER: same event without the sn field */
+TRACE_EVENT(hfi1_sdma_progress,
+           TP_PROTO(struct sdma_engine *sde,
+                    u16 hwhead, u16 swhead,
+                    struct sdma_txreq *txp
+                    ),
+           TP_ARGS(sde, hwhead, swhead, txp),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                   __field(u16, hwhead)
+                   __field(u16, swhead)
+                   __field(u16, txnext)
+                   __field(u16, tx_tail)
+                   __field(u16, tx_head)
+                   __field(u8, idx)
+                   ),
+           TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                   __entry->hwhead = hwhead;
+                   __entry->swhead = swhead;
+                   __entry->tx_tail = sde->tx_tail;
+                   __entry->tx_head = sde->tx_head;
+                   __entry->txnext = txp ? txp->next_descq_idx : ~0; /* txp may be NULL; then all bits are set */
+                   __entry->idx = sde->this_idx;
+                   ),
+           TP_printk(
+                   "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+                   __get_str(dev),
+                   __entry->idx,
+                   __entry->hwhead,
+                   __entry->swhead,
+                   __entry->txnext, /* printed under the label next_descq_idx */
+                   __entry->tx_head,
+                   __entry->tx_tail
+           )
+);
+#endif /* CONFIG_HFI1_DEBUG_SDMA_ORDER */
+
+DECLARE_EVENT_CLASS(hfi1_sdma_sn, /* engine index plus a caller-supplied 64-bit sn value */
+                   TP_PROTO(struct sdma_engine *sde, u64 sn),
+                   TP_ARGS(sde, sn),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                   __field(u64, sn)
+                   __field(u8, idx) /* from sde->this_idx */
+                   ),
+                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                   __entry->sn = sn;
+                   __entry->idx = sde->this_idx;
+                   ),
+                   TP_printk("[%s] SDE(%u) sn %llu",
+                             __get_str(dev),
+                             __entry->idx,
+                             __entry->sn
+                             )
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn, /* instance of the class above */
+            TP_PROTO(
+            struct sdma_engine *sde,
+            u64 sn
+            ),
+            TP_ARGS(sde, sn)
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn, /* instance of the class above */
+            TP_PROTO(struct sdma_engine *sde, u64 sn),
+            TP_ARGS(sde, sn)
+);
+
+#define USDMA_HDR_FORMAT \
+       "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x"
+
+TRACE_EVENT(hfi1_sdma_user_header, /* dumps a user SDMA packet header as 32-bit words in CPU byte order */
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+                    struct hfi1_pkt_header *hdr, u32 tidval),
+           TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u16, req)
+                   __field(u32, pbc0)
+                   __field(u32, pbc1)
+                   __field(u32, lrh0)
+                   __field(u32, lrh1)
+                   __field(u32, bth0)
+                   __field(u32, bth1)
+                   __field(u32, bth2)
+                   __field(u32, kdeth0)
+                   __field(u32, kdeth1)
+                   __field(u32, kdeth2)
+                   __field(u32, kdeth3)
+                   __field(u32, kdeth4)
+                   __field(u32, kdeth5)
+                   __field(u32, kdeth6)
+                   __field(u32, kdeth7)
+                   __field(u32, kdeth8)
+                   __field(u32, tidval)
+                   ),
+                   TP_fast_assign(
+                   __le32 *pbc = (__le32 *)hdr->pbc; /* two words, converted with le32_to_cpu() */
+                   __be32 *lrh = (__be32 *)hdr->lrh; /* two words, converted with be32_to_cpu() */
+                   __be32 *bth = (__be32 *)hdr->bth; /* three words, converted with be32_to_cpu() */
+                   __le32 *kdeth = (__le32 *)&hdr->kdeth; /* nine words, converted with le32_to_cpu() */
+
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->req = req;
+                   __entry->pbc0 = le32_to_cpu(pbc[0]);
+                   __entry->pbc1 = le32_to_cpu(pbc[1]);
+                   __entry->lrh0 = be32_to_cpu(lrh[0]);
+                   __entry->lrh1 = be32_to_cpu(lrh[1]);
+                   __entry->bth0 = be32_to_cpu(bth[0]);
+                   __entry->bth1 = be32_to_cpu(bth[1]);
+                   __entry->bth2 = be32_to_cpu(bth[2]);
+                   __entry->kdeth0 = le32_to_cpu(kdeth[0]);
+                   __entry->kdeth1 = le32_to_cpu(kdeth[1]);
+                   __entry->kdeth2 = le32_to_cpu(kdeth[2]);
+                   __entry->kdeth3 = le32_to_cpu(kdeth[3]);
+                   __entry->kdeth4 = le32_to_cpu(kdeth[4]);
+                   __entry->kdeth5 = le32_to_cpu(kdeth[5]);
+                   __entry->kdeth6 = le32_to_cpu(kdeth[6]);
+                   __entry->kdeth7 = le32_to_cpu(kdeth[7]);
+                   __entry->kdeth8 = le32_to_cpu(kdeth[8]);
+                   __entry->tidval = tidval;
+           ),
+           TP_printk(USDMA_HDR_FORMAT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->req,
+                     __entry->pbc1, /* note: pbc[1] is printed before pbc[0] */
+                     __entry->pbc0,
+                     __entry->lrh0,
+                     __entry->lrh1,
+                     __entry->bth0,
+                     __entry->bth1,
+                     __entry->bth2,
+                     __entry->kdeth0,
+                     __entry->kdeth1,
+                     __entry->kdeth2,
+                     __entry->kdeth3,
+                     __entry->kdeth4,
+                     __entry->kdeth5,
+                     __entry->kdeth6,
+                     __entry->kdeth7,
+                     __entry->kdeth8,
+                     __entry->tidval
+           )
+);
+
+#define SDMA_UREQ_FMT \
+       "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u"
+TRACE_EVENT(hfi1_sdma_user_reqinfo, /* decodes the four u16 words at i[0..3] into the fields below */
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
+           TP_ARGS(dd, ctxt, subctxt, i),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd) /* no ';' here: this is a field list, as in the other events */
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u8, ver_opcode)
+                   __field(u8, iovcnt)
+                   __field(u16, npkts)
+                   __field(u16, fragsize)
+                   __field(u16, comp_idx)
+           ),
+           TP_fast_assign(
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->ver_opcode = i[0] & 0xff; /* low byte of i[0] */
+                   __entry->iovcnt = (i[0] >> 8) & 0xff; /* high byte of i[0] */
+                   __entry->npkts = i[1];
+                   __entry->fragsize = i[2];
+                   __entry->comp_idx = i[3];
+           ),
+           TP_printk(SDMA_UREQ_FMT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->ver_opcode,
+                     __entry->iovcnt,
+                     __entry->npkts,
+                     __entry->fragsize,
+                     __entry->comp_idx
+                     )
+);
+
+#define usdma_complete_name(st) { st, #st } /* { value, "name" } pair for __print_symbolic() */
+#define show_usdma_complete_state(st)                  \
+       __print_symbolic(st,                            \
+                       usdma_complete_name(FREE),      \
+                       usdma_complete_name(QUEUED),    \
+                       usdma_complete_name(COMPLETE), \
+                       usdma_complete_name(ERROR))
+
+TRACE_EVENT(hfi1_sdma_user_completion, /* records a completion state and code for one ctxt/subctxt/idx */
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx,
+                    u8 state, int code),
+           TP_ARGS(dd, ctxt, subctxt, idx, state, code),
+           TP_STRUCT__entry(
+           DD_DEV_ENTRY(dd)
+           __field(u16, ctxt)
+           __field(u8, subctxt)
+           __field(u16, idx)
+           __field(u8, state) /* printed by name via show_usdma_complete_state() */
+           __field(int, code) /* printed as the number in parentheses */
+           ),
+           TP_fast_assign(
+           DD_DEV_ASSIGN(dd);
+           __entry->ctxt = ctxt;
+           __entry->subctxt = subctxt;
+           __entry->idx = idx;
+           __entry->state = state;
+           __entry->code = code;
+           ),
+           TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)",
+                     __get_str(dev), __entry->ctxt, __entry->subctxt,
+                     __entry->idx, show_usdma_complete_state(__entry->state),
+                     __entry->code)
+);
+
+const char *print_u32_array(struct trace_seq *, u32 *, int);
+#define __print_u32_hex(arr, len) print_u32_array(p, arr, len) /* passes a variable named p from the expansion site */
+
+TRACE_EVENT(hfi1_sdma_user_header_ahg, /* records up to 10 u32 words copied from the ahg argument */
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+                    u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval),
+           TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval),
+           TP_STRUCT__entry(
+           DD_DEV_ENTRY(dd)
+           __field(u16, ctxt)
+           __field(u8, subctxt)
+           __field(u16, req)
+           __field(u8, sde)
+           __field(u8, idx) /* holds the ahgidx argument */
+           __field(int, len) /* number of valid words in ahg[], clamped to its capacity */
+           __field(u32, tidval)
+           __array(u32, ahg, 10) /* fixed-size storage inside the trace record */
+           ),
+           TP_fast_assign(
+           DD_DEV_ASSIGN(dd);
+           __entry->ctxt = ctxt;
+           __entry->subctxt = subctxt;
+           __entry->req = req;
+           __entry->sde = sde;
+           __entry->idx = ahgidx;
+           __entry->len = clamp_t(int, len, 0, ARRAY_SIZE(__entry->ahg)); /* bound the copy to ahg[] */
+           __entry->tidval = tidval;
+           memcpy(__entry->ahg, ahg, __entry->len * sizeof(u32)); /* uses the clamped length, never the raw len */
+           ),
+           TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x",
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->req,
+                     __entry->sde,
+                     __entry->idx,
+                     __entry->len - 1, /* index of the last recorded word */
+                     __print_u32_hex(__entry->ahg, __entry->len),
+                     __entry->tidval
+                     )
+);
+
+TRACE_EVENT(hfi1_sdma_state, /* records two caller-supplied state-name strings */
+           TP_PROTO(struct sdma_engine *sde,
+                    const char *cstate,
+                    const char *nstate
+                    ),
+           TP_ARGS(sde, cstate, nstate),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+               __string(curstate, cstate) /* printed as "current state" */
+               __string(newstate, nstate) /* printed as "new state" */
+           ),
+           TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+               __assign_str(curstate, cstate);
+               __assign_str(newstate, nstate);
+           ),
+           TP_printk("[%s] current state %s new state %s",
+                     __get_str(dev),
+                     __get_str(curstate),
+                     __get_str(newstate)
+           )
+);
+
+#define BCT_FORMAT \
+       "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]"
+/* BCT(field) reads one big-endian 16-bit member from the bytes saved in the bct dynamic array. */
+#define BCT(field) \
+       be16_to_cpu( \
+       ((struct buffer_control *)__get_dynamic_array(bct))->field \
+       )
+
+DECLARE_EVENT_CLASS(hfi1_bct_template, /* stores a byte copy of *bc and decodes it at print time */
+                   TP_PROTO(struct hfi1_devdata *dd,
+                            struct buffer_control *bc),
+                   TP_ARGS(dd, bc),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                   __dynamic_array(u8, bct, sizeof(*bc)) /* sized to hold one struct buffer_control */
+                   ),
+                   TP_fast_assign(DD_DEV_ASSIGN(dd);
+                                  memcpy(__get_dynamic_array(bct), bc,
+                                         sizeof(*bc));
+                   ),
+                   TP_printk(BCT_FORMAT,
+                             BCT(overall_shared_limit),
+
+                             BCT(vl[0].dedicated), /* each [%x,%x] pair is dedicated, then shared */
+                             BCT(vl[0].shared),
+
+                             BCT(vl[1].dedicated),
+                             BCT(vl[1].shared),
+
+                             BCT(vl[2].dedicated),
+                             BCT(vl[2].shared),
+
+                             BCT(vl[3].dedicated),
+                             BCT(vl[3].shared),
+
+                             BCT(vl[4].dedicated),
+                             BCT(vl[4].shared),
+
+                             BCT(vl[5].dedicated),
+                             BCT(vl[5].shared),
+
+                             BCT(vl[6].dedicated),
+                             BCT(vl[6].shared),
+
+                             BCT(vl[7].dedicated),
+                             BCT(vl[7].shared),
+
+                             BCT(vl[15].dedicated), /* vl[8] through vl[14] are not printed */
+                             BCT(vl[15].shared)
+                   )
+);
+
+DEFINE_EVENT(hfi1_bct_template, bct_set, /* instance of the class above */
+            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+            TP_ARGS(dd, bc));
+
+DEFINE_EVENT(hfi1_bct_template, bct_get, /* instance of the class above */
+            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+            TP_ARGS(dd, bc));
+
+#endif /* __HFI1_TRACE_TX_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_tx
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/twsi.c b/drivers/infiniband/hw/hfi1/twsi.c

deleted file mode 100644 (file)

index e82e52a..0000000
--- a/drivers/infiniband/hw/hfi1/twsi.c
+++ /dev/null
@@ -1,489 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <linux/vmalloc.h>
-
-#include "hfi.h"
-#include "twsi.h"
-
-/*
- * "Two Wire Serial Interface" support.
- *
- * Originally written for a not-quite-i2c serial eeprom, which is
- * still used on some supported boards. Later boards have added a
- * variety of other uses, most board-specific, so the bit-boffing
- * part has been split off to this file, while the other parts
- * have been moved to chip-specific files.
- *
- * We have also dropped all pretense of fully generic (e.g. pretend
- * we don't know whether '1' is the higher voltage) interface, as
- * the restrictions of the generic i2c interface (e.g. no access from
- * driver itself) make it unsuitable for this use.
- */
-
-#define READ_CMD 1
-#define WRITE_CMD 0
-
-/**
- * i2c_wait_for_writes - wait for a write
- * @dd: the hfi1_ib device
- *
- * We use this instead of udelay directly, so we can make sure
- * that previous register writes have been flushed all the way
- * to the chip.  Since we are delaying anyway, the cost doesn't
- * hurt, and makes the bit twiddling more regular
- */
-static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target)
-{
-       /*
-        * implicit read of EXTStatus is as good as explicit
-        * read of scratch, if all we want to do is flush
-        * writes.
-        */
-       hfi1_gpio_mod(dd, target, 0, 0, 0);
-       rmb(); /* inlined, so prevent compiler reordering */
-}
-
-/*
- * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that
- * for "almost compliant" modules
- */
-#define SCL_WAIT_USEC 1000
-
-/* BUF_WAIT is time bus must be free between STOP or ACK and to next START.
- * Should be 20, but some chips need more.
- */
-#define TWSI_BUF_WAIT_USEC 60
-
-static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit)
-{
-       u32 mask;
-
-       udelay(1);
-
-       mask = QSFP_HFI0_I2CCLK;
-
-       /* SCL is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
-
-       /*
-        * Allow for slow slaves by simple
-        * delay for falling edge, sampling on rise.
-        */
-       if (!bit) {
-               udelay(2);
-       } else {
-               int rise_usec;
-
-               for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) {
-                       if (mask & hfi1_gpio_mod(dd, target, 0, 0, 0))
-                               break;
-                       udelay(2);
-               }
-               if (rise_usec <= 0)
-                       dd_dev_err(dd, "SCL interface stuck low > %d uSec\n",
-                                  SCL_WAIT_USEC);
-       }
-       i2c_wait_for_writes(dd, target);
-}
-
-static u8 scl_in(struct hfi1_devdata *dd, u32 target, int wait)
-{
-       u32 read_val, mask;
-
-       mask = QSFP_HFI0_I2CCLK;
-       /* SCL is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, 0, mask);
-       read_val = hfi1_gpio_mod(dd, target, 0, 0, 0);
-       if (wait)
-               i2c_wait_for_writes(dd, target);
-       return (read_val & mask) >> GPIO_SCL_NUM;
-}
-
-static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit)
-{
-       u32 mask;
-
-       mask = QSFP_HFI0_I2CDAT;
-
-       /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
-
-       i2c_wait_for_writes(dd, target);
-       udelay(2);
-}
-
-static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait)
-{
-       u32 read_val, mask;
-
-       mask = QSFP_HFI0_I2CDAT;
-       /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, 0, mask);
-       read_val = hfi1_gpio_mod(dd, target, 0, 0, 0);
-       if (wait)
-               i2c_wait_for_writes(dd, target);
-       return (read_val & mask) >> GPIO_SDA_NUM;
-}
-
-/**
- * i2c_ackrcv - see if ack following write is true
- * @dd: the hfi1_ib device
- */
-static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target)
-{
-       u8 ack_received;
-
-       /* AT ENTRY SCL = LOW */
-       /* change direction, ignore data */
-       ack_received = sda_in(dd, target, 1);
-       scl_out(dd, target, 1);
-       ack_received = sda_in(dd, target, 1) == 0;
-       scl_out(dd, target, 0);
-       return ack_received;
-}
-
-static void stop_cmd(struct hfi1_devdata *dd, u32 target);
-
-/**
- * rd_byte - read a byte, sending STOP on last, else ACK
- * @dd: the hfi1_ib device
- *
- * Returns byte shifted out of device
- */
-static int rd_byte(struct hfi1_devdata *dd, u32 target, int last)
-{
-       int bit_cntr, data;
-
-       data = 0;
-
-       for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
-               data <<= 1;
-               scl_out(dd, target, 1);
-               data |= sda_in(dd, target, 0);
-               scl_out(dd, target, 0);
-       }
-       if (last) {
-               scl_out(dd, target, 1);
-               stop_cmd(dd, target);
-       } else {
-               sda_out(dd, target, 0);
-               scl_out(dd, target, 1);
-               scl_out(dd, target, 0);
-               sda_out(dd, target, 1);
-       }
-       return data;
-}
-
-/**
- * wr_byte - write a byte, one bit at a time
- * @dd: the hfi1_ib device
- * @data: the byte to write
- *
- * Returns 0 if we got the following ack, otherwise 1
- */
-static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data)
-{
-       int bit_cntr;
-       u8 bit;
-
-       for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
-               bit = (data >> bit_cntr) & 1;
-               sda_out(dd, target, bit);
-               scl_out(dd, target, 1);
-               scl_out(dd, target, 0);
-       }
-       return (!i2c_ackrcv(dd, target)) ? 1 : 0;
-}
-
-/*
- * issue TWSI start sequence:
- * (both clock/data high, clock high, data low while clock is high)
- */
-static void start_seq(struct hfi1_devdata *dd, u32 target)
-{
-       sda_out(dd, target, 1);
-       scl_out(dd, target, 1);
-       sda_out(dd, target, 0);
-       udelay(1);
-       scl_out(dd, target, 0);
-}
-
-/**
- * stop_seq - transmit the stop sequence
- * @dd: the hfi1_ib device
- *
- * (both clock/data low, clock high, data high while clock is high)
- */
-static void stop_seq(struct hfi1_devdata *dd, u32 target)
-{
-       scl_out(dd, target, 0);
-       sda_out(dd, target, 0);
-       scl_out(dd, target, 1);
-       sda_out(dd, target, 1);
-}
-
-/**
- * stop_cmd - transmit the stop condition
- * @dd: the hfi1_ib device
- *
- * (both clock/data low, clock high, data high while clock is high)
- */
-static void stop_cmd(struct hfi1_devdata *dd, u32 target)
-{
-       stop_seq(dd, target);
-       udelay(TWSI_BUF_WAIT_USEC);
-}
-
-/**
- * hfi1_twsi_reset - reset I2C communication
- * @dd: the hfi1_ib device
- * returns 0 if ok, -EIO on error
- */
-int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target)
-{
-       int clock_cycles_left = 9;
-       u32 mask;
-
-       /* Both SCL and SDA should be high. If not, there
-        * is something wrong.
-        */
-       mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT;
-
-       /*
-        * Force pins to desired innocuous state.
-        * This is the default power-on state with out=0 and dir=0,
-        * So tri-stated and should be floating high (barring HW problems)
-        */
-       hfi1_gpio_mod(dd, target, 0, 0, mask);
-
-       /* Check if SCL is low, if it is low then we have a slave device
-        * misbehaving and there is not much we can do.
-        */
-       if (!scl_in(dd, target, 0))
-               return -EIO;
-
-       /* Check if SDA is low, if it is low then we have to clock SDA
-        * up to 9 times for the device to release the bus
-        */
-       while (clock_cycles_left--) {
-               if (sda_in(dd, target, 0))
-                       return 0;
-               scl_out(dd, target, 0);
-               scl_out(dd, target, 1);
-       }
-
-       return -EIO;
-}
-
-#define HFI1_TWSI_START 0x100
-#define HFI1_TWSI_STOP 0x200
-
-/* Write byte to TWSI, optionally prefixed with START or suffixed with
- * STOP.
- * returns 0 if OK (ACK received), else != 0
- */
-static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags)
-{
-       int ret = 1;
-
-       if (flags & HFI1_TWSI_START)
-               start_seq(dd, target);
-
-       /* Leaves SCL low (from i2c_ackrcv()) */
-       ret = wr_byte(dd, target, data);
-
-       if (flags & HFI1_TWSI_STOP)
-               stop_cmd(dd, target);
-       return ret;
-}
-
-/* Added functionality for IBA7220-based cards */
-#define HFI1_TEMP_DEV 0x98
-
-/*
- * hfi1_twsi_blk_rd
- * General interface for data transfer from twsi devices.
- * One vestige of its former role is that it recognizes a device
- * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
- * which responded to all TWSI device codes, interpreting them as
- * address within device. On all other devices found on board handled by
- * this driver, the device is followed by a N-byte "address" which selects
- * the "register" or "offset" within the device from which data should
- * be read.
- */
-int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    void *buffer, int len)
-{
-       u8 *bp = buffer;
-       int ret = 1;
-       int i;
-       int offset_size;
-
-       /* obtain the offset size, strip it from the device address */
-       offset_size = (dev >> 8) & 0xff;
-       dev &= 0xff;
-
-       /* allow at most a 2 byte offset */
-       if (offset_size > 2)
-               goto bail;
-
-       if (dev == HFI1_TWSI_NO_DEV) {
-               /* legacy not-really-I2C */
-               addr = (addr << 1) | READ_CMD;
-               ret = twsi_wr(dd, target, addr, HFI1_TWSI_START);
-       } else {
-               /* Actual I2C */
-               if (offset_size) {
-                       ret = twsi_wr(dd, target,
-                                     dev | WRITE_CMD, HFI1_TWSI_START);
-                       if (ret) {
-                               stop_cmd(dd, target);
-                               goto bail;
-                       }
-
-                       for (i = 0; i < offset_size; i++) {
-                               ret = twsi_wr(dd, target,
-                                             (addr >> (i * 8)) & 0xff, 0);
-                               udelay(TWSI_BUF_WAIT_USEC);
-                               if (ret) {
-                                       dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n",
-                                                  i, addr);
-                                       goto bail;
-                               }
-                       }
-               }
-               ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START);
-       }
-       if (ret) {
-               stop_cmd(dd, target);
-               goto bail;
-       }
-
-       /*
-        * block devices keeps clocking data out as long as we ack,
-        * automatically incrementing the address. Some have "pages"
-        * whose boundaries will not be crossed, but the handling
-        * of these is left to the caller, who is in a better
-        * position to know.
-        */
-       while (len-- > 0) {
-               /*
-                * Get and store data, sending ACK if length remaining,
-                * else STOP
-                */
-               *bp++ = rd_byte(dd, target, !len);
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/*
- * hfi1_twsi_blk_wr
- * General interface for data transfer to twsi devices.
- * One vestige of its former role is that it recognizes a device
- * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
- * which responded to all TWSI device codes, interpreting them as
- * address within device. On all other devices found on board handled by
- * this driver, the device is followed by a N-byte "address" which selects
- * the "register" or "offset" within the device to which data should
- * be written.
- */
-int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    const void *buffer, int len)
-{
-       const u8 *bp = buffer;
-       int ret = 1;
-       int i;
-       int offset_size;
-
-       /* obtain the offset size, strip it from the device address */
-       offset_size = (dev >> 8) & 0xff;
-       dev &= 0xff;
-
-       /* allow at most a 2 byte offset */
-       if (offset_size > 2)
-               goto bail;
-
-       if (dev == HFI1_TWSI_NO_DEV) {
-               if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD,
-                           HFI1_TWSI_START)) {
-                       goto failed_write;
-               }
-       } else {
-               /* Real I2C */
-               if (twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START))
-                       goto failed_write;
-       }
-
-       for (i = 0; i < offset_size; i++) {
-               ret = twsi_wr(dd, target, (addr >> (i * 8)) & 0xff, 0);
-               udelay(TWSI_BUF_WAIT_USEC);
-               if (ret) {
-                       dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n",
-                                  i, addr);
-                       goto bail;
-               }
-       }
-
-       for (i = 0; i < len; i++)
-               if (twsi_wr(dd, target, *bp++, 0))
-                       goto failed_write;
-
-       ret = 0;
-
-failed_write:
-       stop_cmd(dd, target);
-
-bail:
-       return ret;
-}
diff --git a/drivers/infiniband/hw/hfi1/twsi.h b/drivers/infiniband/hw/hfi1/twsi.h

deleted file mode 100644 (file)

index 5b8a5b5..0000000
--- a/drivers/infiniband/hw/hfi1/twsi.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef _TWSI_H
-#define _TWSI_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#define HFI1_TWSI_NO_DEV 0xFF
-
-struct hfi1_devdata;
-
-/* Bit position of SDA/SCL pins in ASIC_QSFP* registers  */
-#define  GPIO_SDA_NUM 1
-#define  GPIO_SCL_NUM 0
-
-/* these functions must be called with qsfp_lock held */
-int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target);
-int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    void *buffer, int len);
-int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    const void *buffer, int len);
-
-#endif /* _TWSI_H */
diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c

index df773d4332970bd2ed2e9567d59fcf55c397f0cd..a726d96d185f3261ac77f99b73fe6112aa90c914 100644 (file)
--- a/drivers/infiniband/hw/hfi1/uc.c
+++ b/drivers/infiniband/hw/hfi1/uc.c
@@ -118,6 +118,31 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                         clear_ahg(qp);
                         goto bail;
                 }
+               /*
+                * Local operations are processed immediately
+                * after all prior requests have completed.
+                */
+               if (wqe->wr.opcode == IB_WR_REG_MR ||
+                   wqe->wr.opcode == IB_WR_LOCAL_INV) {
+                       int local_ops = 0;
+                       int err = 0;
+
+                       if (qp->s_last != qp->s_cur)
+                               goto bail;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
+                               err = rvt_invalidate_rkey(
+                                       qp, wqe->wr.ex.invalidate_rkey);
+                               local_ops = 1;
+                       }
+                       hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR
+                                                       : IB_WC_SUCCESS);
+                       if (local_ops)
+                               atomic_dec(&qp->local_ops_pending);
+                       qp->s_hdrwords = 0;
+                       goto done_free_tx;
+               }
                 /*
                  * Start a new request.
                  */
@@ -294,46 +319,12 @@ void hfi1_uc_rcv(struct hfi1_packet *packet)
         struct ib_reth *reth;
         int has_grh = rcv_flags & HFI1_HAS_GRH;
         int ret;
-       u32 bth1;
  
         bth0 = be32_to_cpu(ohdr->bth[0]);
         if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
                 return;
  
-       bth1 = be32_to_cpu(ohdr->bth[1]);
-       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
-               if (bth1 & HFI1_BECN_SMASK) {
-                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-                       u32 rqpn, lqpn;
-                       u16 rlid = be16_to_cpu(hdr->lrh[3]);
-                       u8 sl, sc5;
-
-                       lqpn = bth1 & RVT_QPN_MASK;
-                       rqpn = qp->remote_qpn;
-
-                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-                       sl = ibp->sc_to_sl[sc5];
-
-                       process_becn(ppd, sl, rlid, lqpn, rqpn,
-                                    IB_CC_SVCTYPE_UC);
-               }
-
-               if (bth1 & HFI1_FECN_SMASK) {
-                       struct ib_grh *grh = NULL;
-                       u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
-                       u16 slid = be16_to_cpu(hdr->lrh[3]);
-                       u16 dlid = be16_to_cpu(hdr->lrh[1]);
-                       u32 src_qp = qp->remote_qpn;
-                       u8 sc5;
-
-                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-                       if (has_grh)
-                               grh = &hdr->u.l.grh;
-
-                       return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5,
-                                  grh);
-               }
-       }
+       process_ecn(qp, packet, true);
  
         psn = be32_to_cpu(ohdr->bth[2]);
         opcode = (bth0 >> 24) & 0xff;
diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c

index be91f6fa1c87b16fe4f31affd6c12848b1951792..f01e8e1d62d3d14d6c6d543630da9e9128695738 100644 (file)
--- a/drivers/infiniband/hw/hfi1/ud.c
+++ b/drivers/infiniband/hw/hfi1/ud.c
@@ -184,8 +184,12 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
         }
  
         if (ah_attr->ah_flags & IB_AH_GRH) {
-               hfi1_copy_sge(&qp->r_sge, &ah_attr->grh,
-                             sizeof(struct ib_grh), 1, 0);
+               struct ib_grh grh;
+               struct ib_global_route grd = ah_attr->grh;
+
+               hfi1_make_grh(ibp, &grh, &grd, 0, 0);
+               hfi1_copy_sge(&qp->r_sge, &grh,
+                             sizeof(grh), 1, 0);
                 wc.wc_flags |= IB_WC_GRH;
         } else {
                 hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
@@ -430,10 +434,9 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                                          qp->qkey : wqe->ud_wr.remote_qkey);
         ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
         /* disarm any ahg */
-       priv->s_hdr->ahgcount = 0;
-       priv->s_hdr->ahgidx = 0;
-       priv->s_hdr->tx_flags = 0;
-       priv->s_hdr->sde = NULL;
+       priv->s_ahg->ahgcount = 0;
+       priv->s_ahg->ahgidx = 0;
+       priv->s_ahg->tx_flags = 0;
         /* pbc */
         ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
  
@@ -665,13 +668,13 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
         struct hfi1_other_headers *ohdr = packet->ohdr;
         int opcode;
         u32 hdrsize = packet->hlen;
-       u32 pad;
         struct ib_wc wc;
         u32 qkey;
         u32 src_qp;
         u16 dlid, pkey;
         int mgmt_pkey_idx = -1;
         struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
         struct hfi1_ib_header *hdr = packet->hdr;
         u32 rcv_flags = packet->rcv_flags;
         void *data = packet->ebuf;
@@ -680,52 +683,33 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
         bool has_grh = rcv_flags & HFI1_HAS_GRH;
         u8 sc5 = hdr2sc((struct hfi1_message_header *)hdr, packet->rhf);
         u32 bth1;
-       int is_mcast;
-       struct ib_grh *grh = NULL;
+       u8 sl_from_sc, sl;
+       u16 slid;
+       u8 extra_bytes;
  
         qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
         src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
         dlid = be16_to_cpu(hdr->lrh[1]);
-       is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
-                       (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
         bth1 = be32_to_cpu(ohdr->bth[1]);
-       if (unlikely(bth1 & HFI1_BECN_SMASK)) {
-               /*
-                * In pre-B0 h/w the CNP_OPCODE is handled via an
-                * error path.
-                */
-               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-               u32 lqpn =  be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-               u8 sl;
-
-               sl = ibp->sc_to_sl[sc5];
-
-               process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD);
-       }
+       slid = be16_to_cpu(hdr->lrh[3]);
+       pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+       sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+       extra_bytes = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+       extra_bytes += (SIZE_OF_CRC << 2);
+       sl_from_sc = ibp->sc_to_sl[sc5];
  
-       /*
-        * The opcode is in the low byte when its in network order
-        * (top byte when in host order).
-        */
         opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
         opcode &= 0xff;
  
-       pkey = (u16)be32_to_cpu(ohdr->bth[0]);
-
-       if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) {
-               u16 slid = be16_to_cpu(hdr->lrh[3]);
-
-               return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
-       }
+       process_ecn(qp, packet, (opcode != IB_OPCODE_CNP));
         /*
          * Get the number of bytes the message was padded by
          * and drop incomplete packets.
          */
-       pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-       if (unlikely(tlen < (hdrsize + pad + 4)))
+       if (unlikely(tlen < (hdrsize + extra_bytes)))
                 goto drop;
  
-       tlen -= hdrsize + pad + 4;
+       tlen -= hdrsize + extra_bytes;
  
         /*
          * Check that the permissive LID is only used on QP0
@@ -736,10 +720,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
                              hdr->lrh[3] == IB_LID_PERMISSIVE))
                         goto drop;
                 if (qp->ibqp.qp_num > 1) {
-                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-                       u16 slid;
-
-                       slid = be16_to_cpu(hdr->lrh[3]);
                         if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) {
                                 /*
                                  * Traps will not be sent for packets dropped
@@ -748,12 +728,9 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
                                  * IB spec (release 1.3, section 10.9.4)
                                  */
                                 hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
-                                              pkey,
-                                              (be16_to_cpu(hdr->lrh[0]) >> 4) &
-                                               0xF,
+                                              pkey, sl,
                                                src_qp, qp->ibqp.qp_num,
-                                              be16_to_cpu(hdr->lrh[3]),
-                                              be16_to_cpu(hdr->lrh[1]));
+                                              slid, dlid);
                                 return;
                         }
                 } else {
@@ -763,22 +740,18 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
                                 goto drop;
                 }
                 if (unlikely(qkey != qp->qkey)) {
-                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey,
-                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey, sl,
                                        src_qp, qp->ibqp.qp_num,
-                                      be16_to_cpu(hdr->lrh[3]),
-                                      be16_to_cpu(hdr->lrh[1]));
+                                      slid, dlid);
                         return;
                 }
                 /* Drop invalid MAD packets (see 13.5.3.1). */
                 if (unlikely(qp->ibqp.qp_num == 1 &&
-                            (tlen > 2048 ||
-                             (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))
+                            (tlen > 2048 || (sc5 == 0xF))))
                         goto drop;
         } else {
                 /* Received on QP0, and so by definition, this is an SMP */
                 struct opa_smp *smp = (struct opa_smp *)data;
-               u16 slid = be16_to_cpu(hdr->lrh[3]);
  
                 if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp))
                         goto drop;
@@ -861,7 +834,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
             qp->ibqp.qp_type == IB_QPT_SMI) {
                 if (mgmt_pkey_idx < 0) {
                         if (net_ratelimit()) {
-                               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
                                 struct hfi1_devdata *dd = ppd->dd;
  
                                 dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n",
@@ -874,8 +846,8 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
                 wc.pkey_index = 0;
         }
  
-       wc.slid = be16_to_cpu(hdr->lrh[3]);
-       wc.sl = ibp->sc_to_sl[sc5];
+       wc.slid = slid;
+       wc.sl = sl_from_sc;
  
         /*
          * Save the LMC lower bits if the destination LID is a unicast LID.
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c

index 1b640a35b3fe82f6f85022599477f6760744f034..64d26525435af43172ec9518b671a2758b0dcf15 100644 (file)
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -82,24 +82,25 @@ struct tid_pageset {
                ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
  
  static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
-                           struct rb_root *);
+                           struct hfi1_filedata *);
  static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
  static int set_rcvarray_entry(struct file *, unsigned long, u32,
                               struct tid_group *, struct page **, unsigned);
-static int mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
-static void mmu_rb_remove(struct rb_root *, struct mmu_rb_node *,
-                         struct mm_struct *);
-static int mmu_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
+static int tid_rb_insert(void *, struct mmu_rb_node *);
+static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
+                                   struct tid_rb_node *tnode);
+static void tid_rb_remove(void *, struct mmu_rb_node *);
+static int tid_rb_invalidate(void *, struct mmu_rb_node *);
  static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
                             struct tid_pageset *, unsigned, u16, struct page **,
                             u32 *, unsigned *, unsigned *);
  static int unprogram_rcvarray(struct file *, u32, struct tid_group **);
-static void clear_tid_node(struct hfi1_filedata *, u16, struct tid_rb_node *);
+static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
  
  static struct mmu_rb_ops tid_rb_ops = {
-       .insert = mmu_rb_insert,
-       .remove = mmu_rb_remove,
-       .invalidate = mmu_rb_invalidate
+       .insert = tid_rb_insert,
+       .remove = tid_rb_remove,
+       .invalidate = tid_rb_invalidate
  };
  
  static inline u32 rcventry2tidinfo(u32 rcventry)
@@ -162,7 +163,6 @@ int hfi1_user_exp_rcv_init(struct file *fp)
  
         spin_lock_init(&fd->tid_lock);
         spin_lock_init(&fd->invalid_lock);
-       fd->tid_rb_root = RB_ROOT;
  
         if (!uctxt->subctxt_cnt || !fd->subctxt) {
                 exp_tid_group_init(&uctxt->tid_group_list);
@@ -197,7 +197,7 @@ int hfi1_user_exp_rcv_init(struct file *fp)
         if (!fd->entry_to_rb)
                 return -ENOMEM;
  
-       if (!HFI1_CAP_IS_USET(TID_UNMAP)) {
+       if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
                 fd->invalid_tid_idx = 0;
                 fd->invalid_tids = kzalloc(uctxt->expected_count *
                                            sizeof(u32), GFP_KERNEL);
@@ -208,15 +208,15 @@ int hfi1_user_exp_rcv_init(struct file *fp)
  
                 /*
                  * Register MMU notifier callbacks. If the registration
-                * fails, continue but turn off the TID caching for
-                * all user contexts.
+                * fails, continue without TID caching for this context.
                  */
-               ret = hfi1_mmu_rb_register(&fd->tid_rb_root, &tid_rb_ops);
+               ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops,
+                                          dd->pport->hfi1_wq,
+                                          &fd->handler);
                 if (ret) {
                         dd_dev_info(dd,
                                     "Failed MMU notifier registration %d\n",
                                     ret);
-                       HFI1_CAP_USET(TID_UNMAP);
                         ret = 0;
                 }
         }
@@ -235,7 +235,7 @@ int hfi1_user_exp_rcv_init(struct file *fp)
          * init.
          */
         spin_lock(&fd->tid_lock);
-       if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) {
+       if (uctxt->subctxt_cnt && fd->handler) {
                 u16 remainder;
  
                 fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
@@ -261,18 +261,16 @@ int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
          * The notifier would have been removed when the process'es mm
          * was freed.
          */
-       if (!HFI1_CAP_IS_USET(TID_UNMAP))
-               hfi1_mmu_rb_unregister(&fd->tid_rb_root);
+       if (fd->handler)
+               hfi1_mmu_rb_unregister(fd->handler);
  
         kfree(fd->invalid_tids);
  
         if (!uctxt->cnt) {
                 if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
-                       unlock_exp_tids(uctxt, &uctxt->tid_full_list,
-                                       &fd->tid_rb_root);
+                       unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
                 if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
-                       unlock_exp_tids(uctxt, &uctxt->tid_used_list,
-                                       &fd->tid_rb_root);
+                       unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
                 list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
                                          list) {
                         list_del_init(&grp->list);
@@ -399,12 +397,12 @@ int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
          * pages, accept the amount pinned so far and program only that.
          * User space knows how to deal with partially programmed buffers.
          */
-       if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) {
+       if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) {
                 ret = -ENOMEM;
                 goto bail;
         }
  
-       pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
+       pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages);
         if (pinned <= 0) {
                 ret = pinned;
                 goto bail;
@@ -559,7 +557,7 @@ nomem:
          * for example), unpin all unmapped pages so we can pin them nex time.
          */
         if (mapped_pages != pinned) {
-               hfi1_release_user_pages(current->mm, &pages[mapped_pages],
+               hfi1_release_user_pages(fd->mm, &pages[mapped_pages],
                                         pinned - mapped_pages,
                                         false);
                 fd->tid_n_pinned -= pinned - mapped_pages;
@@ -829,7 +827,6 @@ static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
         struct hfi1_ctxtdata *uctxt = fd->uctxt;
         struct tid_rb_node *node;
         struct hfi1_devdata *dd = uctxt->dd;
-       struct rb_root *root = &fd->tid_rb_root;
         dma_addr_t phys;
  
         /*
@@ -861,10 +858,10 @@ static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
         node->freed = false;
         memcpy(node->pages, pages, sizeof(struct page *) * npages);
  
-       if (HFI1_CAP_IS_USET(TID_UNMAP))
-               ret = mmu_rb_insert(root, &node->mmu);
+       if (!fd->handler)
+               ret = tid_rb_insert(fd, &node->mmu);
         else
-               ret = hfi1_mmu_rb_insert(root, &node->mmu);
+               ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu);
  
         if (ret) {
                 hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
@@ -904,19 +901,19 @@ static int unprogram_rcvarray(struct file *fp, u32 tidinfo,
         node = fd->entry_to_rb[rcventry];
         if (!node || node->rcventry != (uctxt->expected_base + rcventry))
                 return -EBADF;
-       if (HFI1_CAP_IS_USET(TID_UNMAP))
-               mmu_rb_remove(&fd->tid_rb_root, &node->mmu, NULL);
-       else
-               hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu);
  
         if (grp)
                 *grp = node->grp;
-       clear_tid_node(fd, fd->subctxt, node);
+
+       if (!fd->handler)
+               cacheless_tid_rb_remove(fd, node);
+       else
+               hfi1_mmu_rb_remove(fd->handler, &node->mmu);
+
         return 0;
  }
  
-static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
-                          struct tid_rb_node *node)
+static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
  {
         struct hfi1_ctxtdata *uctxt = fd->uctxt;
         struct hfi1_devdata *dd = uctxt->dd;
@@ -934,7 +931,7 @@ static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
  
         pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
                          PCI_DMA_FROMDEVICE);
-       hfi1_release_user_pages(current->mm, node->pages, node->npages, true);
+       hfi1_release_user_pages(fd->mm, node->pages, node->npages, true);
         fd->tid_n_pinned -= node->npages;
  
         node->grp->used--;
@@ -949,12 +946,15 @@ static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
         kfree(node);
  }
  
+/*
+ * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
+ * clearing nodes in the non-cached case.
+ */
  static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
-                           struct exp_tid_set *set, struct rb_root *root)
+                           struct exp_tid_set *set,
+                           struct hfi1_filedata *fd)
  {
         struct tid_group *grp, *ptr;
-       struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata,
-                                               tid_rb_root);
         int i;
  
         list_for_each_entry_safe(grp, ptr, &set->list, list) {
@@ -969,22 +969,23 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
                                                           uctxt->expected_base];
                                 if (!node || node->rcventry != rcventry)
                                         continue;
-                               if (HFI1_CAP_IS_USET(TID_UNMAP))
-                                       mmu_rb_remove(&fd->tid_rb_root,
-                                                     &node->mmu, NULL);
-                               else
-                                       hfi1_mmu_rb_remove(&fd->tid_rb_root,
-                                                          &node->mmu);
-                               clear_tid_node(fd, -1, node);
+
+                               cacheless_tid_rb_remove(fd, node);
                         }
                 }
         }
  }
  
-static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
+/*
+ * Always return 0 from this function.  A non-zero return indicates that the
+ * remove operation will be called and that memory should be unpinned.
+ * However, the driver cannot unpin out from under PSM.  Instead, retain the
+ * memory (by returning 0) and inform PSM that the memory is going away.  PSM
+ * will call back later when it has removed the memory from its list.
+ */
+static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
  {
-       struct hfi1_filedata *fdata =
-               container_of(root, struct hfi1_filedata, tid_rb_root);
+       struct hfi1_filedata *fdata = arg;
         struct hfi1_ctxtdata *uctxt = fdata->uctxt;
         struct tid_rb_node *node =
                 container_of(mnode, struct tid_rb_node, mmu);
@@ -1025,10 +1026,9 @@ static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
         return 0;
  }
  
-static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node)
+static int tid_rb_insert(void *arg, struct mmu_rb_node *node)
  {
-       struct hfi1_filedata *fdata =
-               container_of(root, struct hfi1_filedata, tid_rb_root);
+       struct hfi1_filedata *fdata = arg;
         struct tid_rb_node *tnode =
                 container_of(node, struct tid_rb_node, mmu);
         u32 base = fdata->uctxt->expected_base;
@@ -1037,14 +1037,20 @@ static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node)
         return 0;
  }
  
-static void mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node,
-                         struct mm_struct *mm)
+static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
+                                   struct tid_rb_node *tnode)
  {
-       struct hfi1_filedata *fdata =
-               container_of(root, struct hfi1_filedata, tid_rb_root);
-       struct tid_rb_node *tnode =
-               container_of(node, struct tid_rb_node, mmu);
         u32 base = fdata->uctxt->expected_base;
  
         fdata->entry_to_rb[tnode->rcventry - base] = NULL;
+       clear_tid_node(fdata, tnode);
+}
+
+static void tid_rb_remove(void *arg, struct mmu_rb_node *node)
+{
+       struct hfi1_filedata *fdata = arg;
+       struct tid_rb_node *tnode =
+               container_of(node, struct tid_rb_node, mmu);
+
+       cacheless_tid_rb_remove(fdata, tnode);
  }
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c

index 88e10b5f55f154be7b97399de635f2fa24a162ff..20f4ddcac3b0f1c78d26c4ede74a5645f71bb641 100644 (file)
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -68,7 +68,8 @@ MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)");
   * could keeping caching buffers.
   *
   */
-bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages)
+bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm,
+                       u32 nlocked, u32 npages)
  {
         unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit,
                 size = (cache_size * (1UL << 20)); /* convert to bytes */
@@ -89,9 +90,9 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages)
         /* Convert to number of pages */
         size = DIV_ROUND_UP(size, PAGE_SIZE);
  
-       down_read(&current->mm->mmap_sem);
-       pinned = current->mm->pinned_vm;
-       up_read(&current->mm->mmap_sem);
+       down_read(&mm->mmap_sem);
+       pinned = mm->pinned_vm;
+       up_read(&mm->mmap_sem);
  
         /* First, check the absolute limit against all pinned pages. */
         if (pinned + npages >= ulimit && !can_lock)
@@ -100,8 +101,8 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages)
         return ((nlocked + npages) <= size) || can_lock;
  }
  
-int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable,
-                           struct page **pages)
+int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t npages,
+                           bool writable, struct page **pages)
  {
         int ret;
  
@@ -109,9 +110,9 @@ int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable,
         if (ret < 0)
                 return ret;
  
-       down_write(&current->mm->mmap_sem);
-       current->mm->pinned_vm += ret;
-       up_write(&current->mm->mmap_sem);
+       down_write(&mm->mmap_sem);
+       mm->pinned_vm += ret;
+       up_write(&mm->mmap_sem);
  
         return ret;
  }
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c

index 47ffd273ecbd7745d25067f4f8268301d89dde25..0ecf27903dc20f62ab2bd26c307d8be7afcfab5e 100644 (file)
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -145,7 +145,7 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 12
  /* Last packet in the request */
  #define TXREQ_FLAGS_REQ_LAST_PKT BIT(0)
  
-#define SDMA_REQ_IN_USE     0
+/* SDMA request flag bits */
  #define SDMA_REQ_FOR_THREAD 1
  #define SDMA_REQ_SEND_DONE  2
  #define SDMA_REQ_HAVE_AHG   3
@@ -183,16 +183,18 @@ struct user_sdma_iovec {
         struct sdma_mmu_node *node;
  };
  
-#define SDMA_CACHE_NODE_EVICT 0
-
  struct sdma_mmu_node {
         struct mmu_rb_node rb;
-       struct list_head list;
         struct hfi1_user_sdma_pkt_q *pq;
         atomic_t refcount;
         struct page **pages;
         unsigned npages;
-       unsigned long flags;
+};
+
+/* evict operation argument */
+struct evict_data {
+       u32 cleared;    /* count evicted so far */
+       u32 target;     /* target count to evict */
  };
  
  struct user_sdma_request {
@@ -305,14 +307,16 @@ static int defer_packet_queue(
         unsigned seq);
  static void activate_packet_queue(struct iowait *, int);
  static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long);
-static int sdma_rb_insert(struct rb_root *, struct mmu_rb_node *);
-static void sdma_rb_remove(struct rb_root *, struct mmu_rb_node *,
-                          struct mm_struct *);
-static int sdma_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
+static int sdma_rb_insert(void *, struct mmu_rb_node *);
+static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
+                        void *arg2, bool *stop);
+static void sdma_rb_remove(void *, struct mmu_rb_node *);
+static int sdma_rb_invalidate(void *, struct mmu_rb_node *);
  
  static struct mmu_rb_ops sdma_rb_ops = {
         .filter = sdma_rb_filter,
         .insert = sdma_rb_insert,
+       .evict = sdma_rb_evict,
         .remove = sdma_rb_remove,
         .invalidate = sdma_rb_invalidate
  };
@@ -397,6 +401,11 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
         if (!pq->reqs)
                 goto pq_reqs_nomem;
  
+       memsize = BITS_TO_LONGS(hfi1_sdma_comp_ring_size) * sizeof(long);
+       pq->req_in_use = kzalloc(memsize, GFP_KERNEL);
+       if (!pq->req_in_use)
+               goto pq_reqs_no_in_use;
+
         INIT_LIST_HEAD(&pq->list);
         pq->dd = dd;
         pq->ctxt = uctxt->ctxt;
@@ -405,9 +414,8 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
         pq->state = SDMA_PKT_Q_INACTIVE;
         atomic_set(&pq->n_reqs, 0);
         init_waitqueue_head(&pq->wait);
-       pq->sdma_rb_root = RB_ROOT;
-       INIT_LIST_HEAD(&pq->evict);
-       spin_lock_init(&pq->evict_lock);
+       atomic_set(&pq->n_locked, 0);
+       pq->mm = fd->mm;
  
         iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
                     activate_packet_queue, NULL);
@@ -437,7 +445,8 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
         cq->nentries = hfi1_sdma_comp_ring_size;
         fd->cq = cq;
  
-       ret = hfi1_mmu_rb_register(&pq->sdma_rb_root, &sdma_rb_ops);
+       ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
+                                  &pq->handler);
         if (ret) {
                 dd_dev_err(dd, "Failed to register with MMU %d", ret);
                 goto done;
@@ -453,6 +462,8 @@ cq_comps_nomem:
  cq_nomem:
         kmem_cache_destroy(pq->txreq_cache);
  pq_txreq_nomem:
+       kfree(pq->req_in_use);
+pq_reqs_no_in_use:
         kfree(pq->reqs);
  pq_reqs_nomem:
         kfree(pq);
@@ -472,8 +483,9 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
         hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
                   uctxt->ctxt, fd->subctxt);
         pq = fd->pq;
-       hfi1_mmu_rb_unregister(&pq->sdma_rb_root);
         if (pq) {
+               if (pq->handler)
+                       hfi1_mmu_rb_unregister(pq->handler);
                 spin_lock_irqsave(&uctxt->sdma_qlock, flags);
                 if (!list_empty(&pq->list))
                         list_del_init(&pq->list);
@@ -484,6 +496,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
                         pq->wait,
                         (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
                 kfree(pq->reqs);
+               kfree(pq->req_in_use);
                 kmem_cache_destroy(pq->txreq_cache);
                 kfree(pq);
                 fd->pq = NULL;
@@ -496,10 +509,31 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
         return 0;
  }
  
+static u8 dlid_to_selector(u16 dlid)
+{
+       static u8 mapping[256];
+       static int initialized;
+       static u8 next;
+       int hash;
+
+       if (!initialized) {
+               memset(mapping, 0xFF, 256);
+               initialized = 1;
+       }
+
+       hash = ((dlid >> 8) ^ dlid) & 0xFF;
+       if (mapping[hash] == 0xFF) {
+               mapping[hash] = next;
+               next = (next + 1) & 0x7F;
+       }
+
+       return mapping[hash];
+}
+
  int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
                                    unsigned long dim, unsigned long *count)
  {
-       int ret = 0, i = 0;
+       int ret = 0, i;
         struct hfi1_filedata *fd = fp->private_data;
         struct hfi1_ctxtdata *uctxt = fd->uctxt;
         struct hfi1_user_sdma_pkt_q *pq = fd->pq;
@@ -511,6 +545,8 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
         struct user_sdma_request *req;
         u8 opcode, sc, vl;
         int req_queued = 0;
+       u16 dlid;
+       u8 selector;
  
         if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
                 hfi1_cdbg(
@@ -529,30 +565,48 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
  
         trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
                                      (u16 *)&info);
-       if (cq->comps[info.comp_idx].status == QUEUED ||
-           test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) {
-               hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
-                         dd->unit, uctxt->ctxt, fd->subctxt,
-                         info.comp_idx);
-               return -EBADSLT;
+
+       if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
+               hfi1_cdbg(SDMA,
+                         "[%u:%u:%u:%u] Invalid comp index",
+                         dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
+               return -EINVAL;
         }
+
+       /*
+        * Sanity check the header io vector count.  Need at least 1 vector
+        * (header) and cannot be larger than the actual io vector count.
+        */
+       if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
+               hfi1_cdbg(SDMA,
+                         "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
+                         dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
+                         req_iovcnt(info.ctrl), dim);
+               return -EINVAL;
+       }
+
         if (!info.fragsize) {
                 hfi1_cdbg(SDMA,
                           "[%u:%u:%u:%u] Request does not specify fragsize",
                           dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
                 return -EINVAL;
         }
+
+       /* Try to claim the request. */
+       if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
+               hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
+                         dd->unit, uctxt->ctxt, fd->subctxt,
+                         info.comp_idx);
+               return -EBADSLT;
+       }
         /*
-        * We've done all the safety checks that we can up to this point,
-        * "allocate" the request entry.
+        * All safety checks have been done and this request has been claimed.
          */
         hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
                   uctxt->ctxt, fd->subctxt, info.comp_idx);
         req = pq->reqs + info.comp_idx;
         memset(req, 0, sizeof(*req));
-       /* Mark the request as IN_USE before we start filling it in. */
-       set_bit(SDMA_REQ_IN_USE, &req->flags);
-       req->data_iovs = req_iovcnt(info.ctrl) - 1;
+       req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
         req->pq = pq;
         req->cq = cq;
         req->status = -1;
@@ -560,13 +614,22 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
  
         memcpy(&req->info, &info, sizeof(info));
  
-       if (req_opcode(info.ctrl) == EXPECTED)
+       if (req_opcode(info.ctrl) == EXPECTED) {
+               /* expected must have a TID info and at least one data vector */
+               if (req->data_iovs < 2) {
+                       SDMA_DBG(req,
+                                "Not enough vectors for expected request");
+                       ret = -EINVAL;
+                       goto free_req;
+               }
                 req->data_iovs--;
+       }
  
         if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
                 SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
                          MAX_VECTORS_PER_REQ);
-               return -EINVAL;
+               ret = -EINVAL;
+               goto free_req;
         }
         /* Copy the header from the user buffer */
         ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
@@ -634,7 +697,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
         idx++;
  
         /* Save all the IO vector structures */
-       while (i < req->data_iovs) {
+       for (i = 0; i < req->data_iovs; i++) {
                 INIT_LIST_HEAD(&req->iovs[i].list);
                 memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
                 ret = pin_vector_pages(req, &req->iovs[i]);
@@ -642,7 +705,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
                         req->status = ret;
                         goto free_req;
                 }
-               req->data_len += req->iovs[i++].iov.iov_len;
+               req->data_len += req->iovs[i].iov.iov_len;
         }
         SDMA_DBG(req, "total data length %u", req->data_len);
  
@@ -686,9 +749,13 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
                 idx++;
         }
  
+       dlid = be16_to_cpu(req->hdr.lrh[1]);
+       selector = dlid_to_selector(dlid);
+
         /* Have to select the engine */
         req->sde = sdma_select_engine_vl(dd,
-                                        (u32)(uctxt->ctxt + fd->subctxt),
+                                        (u32)(uctxt->ctxt + fd->subctxt +
+                                              selector),
                                          vl);
         if (!req->sde || !sdma_running(req->sde)) {
                 ret = -ECOMM;
@@ -766,14 +833,21 @@ static inline u32 compute_data_length(struct user_sdma_request *req,
          * The size of the data of the first packet is in the header
          * template. However, it includes the header and ICRC, which need
          * to be subtracted.
+        * The minimum representable packet data length in a header is 4 bytes,
+        * therefore, when the data length request is less than 4 bytes, there's
+        * only one packet, and the packet data length is equal to that of the
+        * request data length.
          * The size of the remaining packets is the minimum of the frag
          * size (MTU) or remaining data in the request.
          */
         u32 len;
  
         if (!req->seqnum) {
-               len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
-                      (sizeof(tx->hdr) - 4));
+               if (req->data_len < sizeof(u32))
+                       len = req->data_len;
+               else
+                       len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
+                              (sizeof(tx->hdr) - 4));
         } else if (req_opcode(req->info.ctrl) == EXPECTED) {
                 u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
                         PAGE_SIZE;
@@ -803,6 +877,13 @@ static inline u32 compute_data_length(struct user_sdma_request *req,
         return len;
  }
  
+static inline u32 pad_len(u32 len)
+{
+       if (len & (sizeof(u32) - 1))
+               len += sizeof(u32) - (len & (sizeof(u32) - 1));
+       return len;
+}
+
  static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
  {
         /* (Size of complete header - size of PBC) + 4B ICRC + data length */
@@ -894,7 +975,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
                 if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
                         if (!req->seqnum) {
                                 u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
-                               u32 lrhlen = get_lrh_len(req->hdr, datalen);
+                               u32 lrhlen = get_lrh_len(req->hdr,
+                                                        pad_len(datalen));
                                 /*
                                  * Copy the request header into the tx header
                                  * because the HW needs a cacheline-aligned
@@ -1048,39 +1130,24 @@ static inline int num_user_pages(const struct iovec *iov)
  
  static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
  {
-       u32 cleared = 0;
-       struct sdma_mmu_node *node, *ptr;
-       struct list_head to_evict = LIST_HEAD_INIT(to_evict);
-
-       spin_lock(&pq->evict_lock);
-       list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) {
-               /* Make sure that no one is still using the node. */
-               if (!atomic_read(&node->refcount)) {
-                       set_bit(SDMA_CACHE_NODE_EVICT, &node->flags);
-                       list_del_init(&node->list);
-                       list_add(&node->list, &to_evict);
-                       cleared += node->npages;
-                       if (cleared >= npages)
-                               break;
-               }
-       }
-       spin_unlock(&pq->evict_lock);
-
-       list_for_each_entry_safe(node, ptr, &to_evict, list)
-               hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
+       struct evict_data evict_data;
  
-       return cleared;
+       evict_data.cleared = 0;
+       evict_data.target = npages;
+       hfi1_mmu_rb_evict(pq->handler, &evict_data);
+       return evict_data.cleared;
  }
  
  static int pin_vector_pages(struct user_sdma_request *req,
-                           struct user_sdma_iovec *iovec) {
+                           struct user_sdma_iovec *iovec)
+{
         int ret = 0, pinned, npages, cleared;
         struct page **pages;
         struct hfi1_user_sdma_pkt_q *pq = req->pq;
         struct sdma_mmu_node *node = NULL;
         struct mmu_rb_node *rb_node;
  
-       rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root,
+       rb_node = hfi1_mmu_rb_extract(pq->handler,
                                       (unsigned long)iovec->iov.iov_base,
                                       iovec->iov.iov_len);
         if (rb_node && !IS_ERR(rb_node))
@@ -1096,7 +1163,6 @@ static int pin_vector_pages(struct user_sdma_request *req,
                 node->rb.addr = (unsigned long)iovec->iov.iov_base;
                 node->pq = pq;
                 atomic_set(&node->refcount, 0);
-               INIT_LIST_HEAD(&node->list);
         }
  
         npages = num_user_pages(&iovec->iov);
@@ -1111,28 +1177,14 @@ static int pin_vector_pages(struct user_sdma_request *req,
  
                 npages -= node->npages;
  
-               /*
-                * If rb_node is NULL, it means that this is brand new node
-                * and, therefore not on the eviction list.
-                * If, however, the rb_node is non-NULL, it means that the
-                * node is already in RB tree and, therefore on the eviction
-                * list (nodes are unconditionally inserted in the eviction
-                * list). In that case, we have to remove the node prior to
-                * calling the eviction function in order to prevent it from
-                * freeing this node.
-                */
-               if (rb_node) {
-                       spin_lock(&pq->evict_lock);
-                       list_del_init(&node->list);
-                       spin_unlock(&pq->evict_lock);
-               }
  retry:
-               if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
+               if (!hfi1_can_pin_pages(pq->dd, pq->mm,
+                                       atomic_read(&pq->n_locked), npages)) {
                         cleared = sdma_cache_evict(pq, npages);
                         if (cleared >= npages)
                                 goto retry;
                 }
-               pinned = hfi1_acquire_user_pages(
+               pinned = hfi1_acquire_user_pages(pq->mm,
                         ((unsigned long)iovec->iov.iov_base +
                          (node->npages * PAGE_SIZE)), npages, 0,
                         pages + node->npages);
@@ -1142,7 +1194,7 @@ retry:
                         goto bail;
                 }
                 if (pinned != npages) {
-                       unpin_vector_pages(current->mm, pages, node->npages,
+                       unpin_vector_pages(pq->mm, pages, node->npages,
                                            pinned);
                         ret = -EFAULT;
                         goto bail;
@@ -1152,28 +1204,22 @@ retry:
                 node->pages = pages;
                 node->npages += pinned;
                 npages = node->npages;
-               spin_lock(&pq->evict_lock);
-               list_add(&node->list, &pq->evict);
-               pq->n_locked += pinned;
-               spin_unlock(&pq->evict_lock);
+               atomic_add(pinned, &pq->n_locked);
         }
         iovec->pages = node->pages;
         iovec->npages = npages;
         iovec->node = node;
  
-       ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
+       ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
         if (ret) {
-               spin_lock(&pq->evict_lock);
-               if (!list_empty(&node->list))
-                       list_del(&node->list);
-               pq->n_locked -= node->npages;
-               spin_unlock(&pq->evict_lock);
+               atomic_sub(node->npages, &pq->n_locked);
+               iovec->node = NULL;
                 goto bail;
         }
         return 0;
  bail:
         if (rb_node)
-               unpin_vector_pages(current->mm, node->pages, 0, node->npages);
+               unpin_vector_pages(pq->mm, node->pages, 0, node->npages);
         kfree(node);
         return ret;
  }
@@ -1181,7 +1227,7 @@ bail:
  static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
                                unsigned start, unsigned npages)
  {
-       hfi1_release_user_pages(mm, pages + start, npages, 0);
+       hfi1_release_user_pages(mm, pages + start, npages, false);
         kfree(pages);
  }
  
@@ -1192,16 +1238,14 @@ static int check_header_template(struct user_sdma_request *req,
         /*
          * Perform safety checks for any type of packet:
          *    - transfer size is multiple of 64bytes
-        *    - packet length is multiple of 4bytes
-        *    - entire request length is multiple of 4bytes
+        *    - packet length is multiple of 4 bytes
          *    - packet length is not larger than MTU size
          *
          * These checks are only done for the first packet of the
          * transfer since the header is "given" to us by user space.
          * For the remainder of the packets we compute the values.
          */
-       if (req->info.fragsize % PIO_BLOCK_SIZE ||
-           lrhlen & 0x3 || req->data_len & 0x3  ||
+       if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
             lrhlen > get_lrh_len(*hdr, req->info.fragsize))
                 return -EINVAL;
  
@@ -1263,7 +1307,7 @@ static int set_txreq_header(struct user_sdma_request *req,
         struct hfi1_pkt_header *hdr = &tx->hdr;
         u16 pbclen;
         int ret;
-       u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen);
+       u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
  
         /* Copy the header template to the request before modification */
         memcpy(hdr, &req->hdr, sizeof(*hdr));
@@ -1374,7 +1418,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
         struct hfi1_user_sdma_pkt_q *pq = req->pq;
         struct hfi1_pkt_header *hdr = &req->hdr;
         u16 pbclen = le16_to_cpu(hdr->pbc[0]);
-       u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len);
+       u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(len));
  
         if (PBC2LRH(pbclen) != lrhlen) {
                 /* PBC.PbcLengthDWs */
@@ -1534,14 +1578,14 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
                                 continue;
  
                         if (unpin)
-                               hfi1_mmu_rb_remove(&req->pq->sdma_rb_root,
+                               hfi1_mmu_rb_remove(req->pq->handler,
                                                    &node->rb);
                         else
                                 atomic_dec(&node->refcount);
                 }
         }
         kfree(req->tids);
-       clear_bit(SDMA_REQ_IN_USE, &req->flags);
+       clear_bit(req->info.comp_idx, req->pq->req_in_use);
  }
  
  static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
@@ -1564,7 +1608,7 @@ static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
         return (bool)(node->addr == addr);
  }
  
-static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
+static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
  {
         struct sdma_mmu_node *node =
                 container_of(mnode, struct sdma_mmu_node, rb);
@@ -1573,48 +1617,45 @@ static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
         return 0;
  }
  
-static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
-                          struct mm_struct *mm)
+/*
+ * Return 1 to remove the node from the rb tree and call the remove op.
+ *
+ * Called with the rb tree lock held.
+ */
+static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
+                        void *evict_arg, bool *stop)
+{
+       struct sdma_mmu_node *node =
+               container_of(mnode, struct sdma_mmu_node, rb);
+       struct evict_data *evict_data = evict_arg;
+
+       /* is this node still being used? */
+       if (atomic_read(&node->refcount))
+               return 0; /* keep this node */
+
+       /* this node will be evicted, add its pages to our count */
+       evict_data->cleared += node->npages;
+
+       /* have enough pages been cleared? */
+       if (evict_data->cleared >= evict_data->target)
+               *stop = true;
+
+       return 1; /* remove this node */
+}
+
+static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
  {
         struct sdma_mmu_node *node =
                 container_of(mnode, struct sdma_mmu_node, rb);
  
-       spin_lock(&node->pq->evict_lock);
-       /*
-        * We've been called by the MMU notifier but this node has been
-        * scheduled for eviction. The eviction function will take care
-        * of freeing this node.
-        * We have to take the above lock first because we are racing
-        * against the setting of the bit in the eviction function.
-        */
-       if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
-               spin_unlock(&node->pq->evict_lock);
-               return;
-       }
+       atomic_sub(node->npages, &node->pq->n_locked);
  
-       if (!list_empty(&node->list))
-               list_del(&node->list);
-       node->pq->n_locked -= node->npages;
-       spin_unlock(&node->pq->evict_lock);
+       unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
  
-       /*
-        * If mm is set, we are being called by the MMU notifier and we
-        * should not pass a mm_struct to unpin_vector_page(). This is to
-        * prevent a deadlock when hfi1_release_user_pages() attempts to
-        * take the mmap_sem, which the MMU notifier has already taken.
-        */
-       unpin_vector_pages(mm ? NULL : current->mm, node->pages, 0,
-                          node->npages);
-       /*
-        * If called by the MMU notifier, we have to adjust the pinned
-        * page count ourselves.
-        */
-       if (mm)
-               mm->pinned_vm -= node->npages;
         kfree(node);
  }
  
-static int sdma_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
+static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
  {
         struct sdma_mmu_node *node =
                 container_of(mnode, struct sdma_mmu_node, rb);
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h

index b9240e351161c9dad40e253c9f045e99b912a858..39001714f5517ecc8c2e74a4769a589d7e6ee06a 100644 (file)
--- a/drivers/infiniband/hw/hfi1/user_sdma.h
+++ b/drivers/infiniband/hw/hfi1/user_sdma.h
@@ -63,14 +63,14 @@ struct hfi1_user_sdma_pkt_q {
         struct hfi1_devdata *dd;
         struct kmem_cache *txreq_cache;
         struct user_sdma_request *reqs;
+       unsigned long *req_in_use;
         struct iowait busy;
         unsigned state;
         wait_queue_head_t wait;
         unsigned long unpinned;
-       struct rb_root sdma_rb_root;
-       u32 n_locked;
-       struct list_head evict;
-       spinlock_t evict_lock; /* protect evict and n_locked */
+       struct mmu_rb_handler *handler;
+       atomic_t n_locked;
+       struct mm_struct *mm;
  };
  
  struct hfi1_user_sdma_comp_q {
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c

index 849c4b9399d428a7930b5ac1bfd4ecd7730ee1ac..2b359540901db3dd4c42cad718cc938ff7139d6a 100644 (file)
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -306,7 +306,10 @@ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
         [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
         [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
         [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
-       [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+       [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
+       [IB_WR_SEND_WITH_INV] = IB_WC_SEND,
+       [IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV,
+       [IB_WR_REG_MR] = IB_WC_REG_MR
  };
  
  /*
@@ -378,6 +381,8 @@ static const opcode_handler opcode_handler_tbl[256] = {
         [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
         [IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
         [IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = &hfi1_rc_rcv,
         /* UC */
         [IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
         [IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
@@ -540,19 +545,15 @@ void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release)
  /*
   * Make sure the QP is ready and able to accept the given opcode.
   */
-static inline int qp_ok(int opcode, struct hfi1_packet *packet)
+static inline opcode_handler qp_ok(int opcode, struct hfi1_packet *packet)
  {
-       struct hfi1_ibport *ibp;
-
         if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
-               goto dropit;
+               return NULL;
         if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
             (opcode == IB_OPCODE_CNP))
-               return 1;
-dropit:
-       ibp = &packet->rcd->ppd->ibport_data;
-       ibp->rvp.n_pkt_drops++;
-       return 0;
+               return opcode_handler_tbl[opcode];
+
+       return NULL;
  }
  
  /**
@@ -571,6 +572,7 @@ void hfi1_ib_rcv(struct hfi1_packet *packet)
         struct hfi1_pportdata *ppd = rcd->ppd;
         struct hfi1_ibport *ibp = &ppd->ibport_data;
         struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+       opcode_handler packet_handler;
         unsigned long flags;
         u32 qp_num;
         int lnh;
@@ -616,8 +618,11 @@ void hfi1_ib_rcv(struct hfi1_packet *packet)
                 list_for_each_entry_rcu(p, &mcast->qp_list, list) {
                         packet->qp = p->qp;
                         spin_lock_irqsave(&packet->qp->r_lock, flags);
-                       if (likely((qp_ok(opcode, packet))))
-                               opcode_handler_tbl[opcode](packet);
+                       packet_handler = qp_ok(opcode, packet);
+                       if (likely(packet_handler))
+                               packet_handler(packet);
+                       else
+                               ibp->rvp.n_pkt_drops++;
                         spin_unlock_irqrestore(&packet->qp->r_lock, flags);
                 }
                 /*
@@ -634,8 +639,11 @@ void hfi1_ib_rcv(struct hfi1_packet *packet)
                         goto drop;
                 }
                 spin_lock_irqsave(&packet->qp->r_lock, flags);
-               if (likely((qp_ok(opcode, packet))))
-                       opcode_handler_tbl[opcode](packet);
+               packet_handler = qp_ok(opcode, packet);
+               if (likely(packet_handler))
+                       packet_handler(packet);
+               else
+                       ibp->rvp.n_pkt_drops++;
                 spin_unlock_irqrestore(&packet->qp->r_lock, flags);
                 rcu_read_unlock();
         }
@@ -808,19 +816,19 @@ static int build_verbs_tx_desc(
         struct rvt_sge_state *ss,
         u32 length,
         struct verbs_txreq *tx,
-       struct ahg_ib_header *ahdr,
+       struct hfi1_ahg_info *ahg_info,
         u64 pbc)
  {
         int ret = 0;
-       struct hfi1_pio_header *phdr = &tx->phdr;
+       struct hfi1_sdma_header *phdr = &tx->phdr;
         u16 hdrbytes = tx->hdr_dwords << 2;
  
-       if (!ahdr->ahgcount) {
+       if (!ahg_info->ahgcount) {
                 ret = sdma_txinit_ahg(
                         &tx->txreq,
-                       ahdr->tx_flags,
+                       ahg_info->tx_flags,
                         hdrbytes + length,
-                       ahdr->ahgidx,
+                       ahg_info->ahgidx,
                         0,
                         NULL,
                         0,
@@ -838,11 +846,11 @@ static int build_verbs_tx_desc(
         } else {
                 ret = sdma_txinit_ahg(
                         &tx->txreq,
-                       ahdr->tx_flags,
+                       ahg_info->tx_flags,
                         length,
-                       ahdr->ahgidx,
-                       ahdr->ahgcount,
-                       ahdr->ahgdesc,
+                       ahg_info->ahgidx,
+                       ahg_info->ahgcount,
+                       ahg_info->ahgdesc,
                         hdrbytes,
                         verbs_sdma_complete);
                 if (ret)
@@ -860,7 +868,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
                         u64 pbc)
  {
         struct hfi1_qp_priv *priv = qp->priv;
-       struct ahg_ib_header *ahdr = priv->s_hdr;
+       struct hfi1_ahg_info *ahg_info = priv->s_ahg;
         u32 hdrwords = qp->s_hdrwords;
         struct rvt_sge_state *ss = qp->s_cur_sge;
         u32 len = qp->s_cur_size;
@@ -888,7 +896,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
                                          plen);
                 }
                 tx->wqe = qp->s_wqe;
-               ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc);
+               ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahg_info, pbc);
                 if (unlikely(ret))
                         goto bail_build;
         }
@@ -1291,19 +1299,24 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
  static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
  {
         struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+       u16 ver = dd->dc8051_ver;
  
         memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props));
  
+       rdi->dparms.props.fw_ver = ((u64)(dc8051_ver_maj(ver)) << 16) |
+                                   (u64)dc8051_ver_min(ver);
         rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
                         IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
                         IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
-                       IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+                       IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE |
+                       IB_DEVICE_MEM_MGT_EXTENSIONS;
         rdi->dparms.props.page_size_cap = PAGE_SIZE;
         rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
         rdi->dparms.props.vendor_part_id = dd->pcidev->device;
         rdi->dparms.props.hw_ver = dd->minrev;
         rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid;
-       rdi->dparms.props.max_mr_size = ~0ULL;
+       rdi->dparms.props.max_mr_size = U64_MAX;
+       rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
         rdi->dparms.props.max_qp = hfi1_max_qps;
         rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
         rdi->dparms.props.max_sge = hfi1_max_sges;
@@ -1567,6 +1580,17 @@ static void init_ibport(struct hfi1_pportdata *ppd)
         RCU_INIT_POINTER(ibp->rvp.qp[1], NULL);
  }
  
+static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str,
+                               size_t str_len)
+{
+       struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
+       struct hfi1_ibdev *dev = dev_from_rdi(rdi);
+       u16 ver = dd_from_dev(dev)->dc8051_ver;
+
+       snprintf(str, str_len, "%u.%u", dc8051_ver_maj(ver),
+                dc8051_ver_min(ver));
+}
+
  /**
   * hfi1_register_ib_device - register our device with the infiniband core
   * @dd: the device data structure
@@ -1613,6 +1637,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
  
         /* keep process mad in the driver */
         ibdev->process_mad = hfi1_process_mad;
+       ibdev->get_dev_fw_str = hfi1_get_dev_fw_str;
  
         strncpy(ibdev->node_desc, init_utsname()->nodename,
                 sizeof(ibdev->node_desc));
@@ -1680,6 +1705,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
         dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
         dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
  
+       /* post send table */
+       dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
+
         ppd = dd->pport;
         for (i = 0; i < dd->num_pports; i++, ppd++)
                 rvt_init_port(&dd->verbs_dev.rdi,
@@ -1730,8 +1758,7 @@ void hfi1_cnp_rcv(struct hfi1_packet *packet)
         struct rvt_qp *qp = packet->qp;
         u32 lqpn, rqpn = 0;
         u16 rlid = 0;
-       u8 sl, sc5, sc4_bit, svc_type;
-       bool sc4_set = has_sc4_bit(packet);
+       u8 sl, sc5, svc_type;
  
         switch (packet->qp->ibqp.qp_type) {
         case IB_QPT_UC:
@@ -1754,9 +1781,7 @@ void hfi1_cnp_rcv(struct hfi1_packet *packet)
                 return;
         }
  
-       sc4_bit = sc4_set << 4;
-       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-       sc5 |= sc4_bit;
+       sc5 = hdr2sc((struct hfi1_message_header *)hdr, packet->rhf);
         sl = ibp->sc_to_sl[sc5];
         lqpn = qp->ibqp.qp_num;
  
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h

index 488356775627809599b6b7702b86d56ef396a2a9..d1b101c5482898b59db32ea895e433ee9bc1ce92 100644 (file)
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -178,16 +178,14 @@ struct hfi1_ib_header {
         } u;
  } __packed;
  
-struct ahg_ib_header {
-       struct sdma_engine *sde;
+struct hfi1_ahg_info {
         u32 ahgdesc[2];
         u16 tx_flags;
         u8 ahgcount;
         u8 ahgidx;
-       struct hfi1_ib_header ibh;
  };
  
-struct hfi1_pio_header {
+struct hfi1_sdma_header {
         __le64 pbc;
         struct hfi1_ib_header hdr;
  } __packed;
@@ -197,7 +195,7 @@ struct hfi1_pio_header {
   * pair is made common
   */
  struct hfi1_qp_priv {
-       struct ahg_ib_header *s_hdr;              /* next header to send */
+       struct hfi1_ahg_info *s_ahg;              /* ahg info for next header */
         struct sdma_engine *s_sde;                /* current sde */
         struct send_context *s_sendcontext;       /* current sendcontext */
         u8 s_sc;                                  /* SC[0..4] for next packet */
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h

index a1d6e0807f97161f5336120ab7e6d0ffdb175b4e..5660897593ba4444f43661cba45d1ee06102ad20 100644 (file)
--- a/drivers/infiniband/hw/hfi1/verbs_txreq.h
+++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h
@@ -56,7 +56,7 @@
  #include "iowait.h"
  
  struct verbs_txreq {
-       struct hfi1_pio_header  phdr;
+       struct hfi1_sdma_header phdr;
         struct sdma_txreq       txreq;
         struct rvt_qp           *qp;
         struct rvt_swqe         *wqe;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c

index d2fa7251696077a3fb3f7b8cec5955bd9348a603..5026dc79978a7c2ea2a1040861913a3aeaf8d7d9 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -1567,12 +1567,12 @@ static enum i40iw_status_code i40iw_del_multiple_qhash(
                 ret = i40iw_manage_qhash(iwdev, cm_info,
                                          I40IW_QHASH_TYPE_TCP_SYN,
                                          I40IW_QHASH_MANAGE_TYPE_DELETE, NULL, false);
-               kfree(child_listen_node);
-               cm_parent_listen_node->cm_core->stats_listen_nodes_destroyed++;
                 i40iw_debug(&iwdev->sc_dev,
                             I40IW_DEBUG_CM,
                             "freed pointer = %p\n",
                             child_listen_node);
+               kfree(child_listen_node);
+               cm_parent_listen_node->cm_core->stats_listen_nodes_destroyed++;
         }
         spin_unlock_irqrestore(&iwdev->cm_core.listen_list_lock, flags);
  
diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h

index bd942da91a2797dcfac81dbadd4aa6dbc32b3643..2fac1db0e0a0070a5a7ab626a75c6564dec1f0a6 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_d.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_d.h
@@ -1557,6 +1557,9 @@ enum i40iw_alignment {
  #define I40IW_RING_MOVE_TAIL(_ring) \
         (_ring).tail = ((_ring).tail + 1) % (_ring).size
  
+#define I40IW_RING_MOVE_HEAD_NOCHECK(_ring) \
+       (_ring).head = ((_ring).head + 1) % (_ring).size
+
  #define I40IW_RING_MOVE_TAIL_BY_COUNT(_ring, _count) \
         (_ring).tail = ((_ring).tail + (_count)) % (_ring).size
  
diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c

index e9c6e82af9c7a07a8bb3d256007f297fcd61aa15..c62d354f78102c6e31ff8b65aae4a1be9c021948 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_puda.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c
@@ -1025,6 +1025,8 @@ static void  i40iw_ieq_compl_pfpdu(struct i40iw_puda_rsrc *ieq,
         u16 txoffset, bufoffset;
  
         buf = i40iw_puda_get_listbuf(pbufl);
+       if (!buf)
+               return;
         nextseqnum = buf->seqnum + fpdu_len;
         txbuf->totallen = buf->hdrlen + fpdu_len;
         txbuf->data = (u8 *)txbuf->mem.va + buf->hdrlen;
@@ -1048,6 +1050,8 @@ static void  i40iw_ieq_compl_pfpdu(struct i40iw_puda_rsrc *ieq,
                 fpdu_len -= buf->datalen;
                 i40iw_puda_ret_bufpool(ieq, buf);
                 buf = i40iw_puda_get_listbuf(pbufl);
+               if (!buf)
+                       return;
                 bufoffset = (u16)(buf->data - (u8 *)buf->mem.va);
         } while (1);
  
diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h

index 16cc61720b530e5879ba509ab8ad43e714fc738c..2b1a04e9ca3c0f1f34480a77ec45252172acaa3f 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_type.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_type.h
@@ -667,7 +667,7 @@ struct i40iw_tcp_offload_info {
         bool time_stamp;
         u8 cwnd_inc_limit;
         bool drop_ooo_seg;
-       bool dup_ack_thresh;
+       u8 dup_ack_thresh;
         u8 ttl;
         u8 src_mac_addr_idx;
         bool avoid_stretch_ack;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_uk.c b/drivers/infiniband/hw/i40iw/i40iw_uk.c

index e35faea88c134ca777c2914a2dcfaf87afc77a25..4d28c3cb03cc3b45bd198b2f52ed7a3507dd78b5 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_uk.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_uk.c
@@ -291,9 +291,9 @@ static enum i40iw_status_code i40iw_rdma_write(struct i40iw_qp_uk *qp,
  
         i40iw_set_fragment(wqe, 0, op_info->lo_sg_list);
  
-       for (i = 1; i < op_info->num_lo_sges; i++) {
-               byte_off = 32 + (i - 1) * 16;
+       for (i = 1, byte_off = 32; i < op_info->num_lo_sges; i++) {
                 i40iw_set_fragment(wqe, byte_off, &op_info->lo_sg_list[i]);
+               byte_off += 16;
         }
  
         wmb(); /* make sure WQE is populated before valid bit is set */
@@ -401,9 +401,9 @@ static enum i40iw_status_code i40iw_send(struct i40iw_qp_uk *qp,
  
         i40iw_set_fragment(wqe, 0, op_info->sg_list);
  
-       for (i = 1; i < op_info->num_sges; i++) {
-               byte_off = 32 + (i - 1) * 16;
+       for (i = 1, byte_off = 32; i < op_info->num_sges; i++) {
                 i40iw_set_fragment(wqe, byte_off, &op_info->sg_list[i]);
+               byte_off += 16;
         }
  
         wmb(); /* make sure WQE is populated before valid bit is set */
@@ -685,9 +685,9 @@ static enum i40iw_status_code i40iw_post_receive(struct i40iw_qp_uk *qp,
  
         i40iw_set_fragment(wqe, 0, info->sg_list);
  
-       for (i = 1; i < info->num_sges; i++) {
-               byte_off = 32 + (i - 1) * 16;
+       for (i = 1, byte_off = 32; i < info->num_sges; i++) {
                 i40iw_set_fragment(wqe, byte_off, &info->sg_list[i]);
+               byte_off += 16;
         }
  
         wmb(); /* make sure WQE is populated before valid bit is set */
@@ -753,8 +753,7 @@ static enum i40iw_status_code i40iw_cq_post_entries(struct i40iw_cq_uk *cq,
   * @post_cq: update cq tail
   */
  static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
-                                                      struct i40iw_cq_poll_info *info,
-                                                      bool post_cq)
+                                                      struct i40iw_cq_poll_info *info)
  {
         u64 comp_ctx, qword0, qword2, qword3, wqe_qword;
         u64 *cqe, *sw_wqe;
@@ -762,7 +761,6 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
         struct i40iw_ring *pring = NULL;
         u32 wqe_idx, q_type, array_idx = 0;
         enum i40iw_status_code ret_code = 0;
-       enum i40iw_status_code ret_code2 = 0;
         bool move_cq_head = true;
         u8 polarity;
         u8 addl_wqes = 0;
@@ -870,19 +868,14 @@ exit:
                         move_cq_head = false;
  
         if (move_cq_head) {
-               I40IW_RING_MOVE_HEAD(cq->cq_ring, ret_code2);
-
-               if (ret_code2 && !ret_code)
-                       ret_code = ret_code2;
+               I40IW_RING_MOVE_HEAD_NOCHECK(cq->cq_ring);
  
                 if (I40IW_RING_GETCURRENT_HEAD(cq->cq_ring) == 0)
                         cq->polarity ^= 1;
  
-               if (post_cq) {
-                       I40IW_RING_MOVE_TAIL(cq->cq_ring);
-                       set_64bit_val(cq->shadow_area, 0,
-                                     I40IW_RING_GETCURRENT_HEAD(cq->cq_ring));
-               }
+               I40IW_RING_MOVE_TAIL(cq->cq_ring);
+               set_64bit_val(cq->shadow_area, 0,
+                             I40IW_RING_GETCURRENT_HEAD(cq->cq_ring));
         } else {
                 if (info->is_srq)
                         return ret_code;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_user.h b/drivers/infiniband/hw/i40iw/i40iw_user.h

index 4627646fe8cde4976681df7cfecdfe103aab80a2..276bcefffd7ea54f6fefe0c44c423b11d8ff1b71 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_user.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_user.h
@@ -327,7 +327,7 @@ struct i40iw_cq_ops {
         void (*iw_cq_request_notification)(struct i40iw_cq_uk *,
                                            enum i40iw_completion_notify);
         enum i40iw_status_code (*iw_cq_poll_completion)(struct i40iw_cq_uk *,
-                                                       struct i40iw_cq_poll_info *, bool);
+                                                       struct i40iw_cq_poll_info *);
         enum i40iw_status_code (*iw_cq_post_entries)(struct i40iw_cq_uk *, u8 count);
         void (*iw_cq_clean)(void *, struct i40iw_cq_uk *);
  };
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c

index 283b64c942eebfea6378ee8f4ad5206e6f549876..2360338877bf68ca4a809d153f83a64326fa467c 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -529,7 +529,7 @@ static int i40iw_setup_kmode_qp(struct i40iw_device *iwdev,
                 status = i40iw_get_wqe_shift(rq_size, ukinfo->max_rq_frag_cnt, 0, &rqshift);
  
         if (status)
-               return -ENOSYS;
+               return -ENOMEM;
  
         sqdepth = sq_size << sqshift;
         rqdepth = rq_size << rqshift;
@@ -671,7 +671,7 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
         iwqp->ctx_info.qp_compl_ctx = (uintptr_t)qp;
  
         if (init_attr->qp_type != IB_QPT_RC) {
-               err_code = -ENOSYS;
+               err_code = -EINVAL;
                 goto error;
         }
         if (iwdev->push_mode)
@@ -1840,6 +1840,7 @@ struct ib_mr *i40iw_reg_phys_mr(struct ib_pd *pd,
         iwmr->ibmr.lkey = stag;
         iwmr->page_cnt = 1;
         iwmr->pgaddrmem[0]  = addr;
+       iwmr->length = size;
         status = i40iw_hwreg_mr(iwdev, iwmr, access);
         if (status) {
                 i40iw_free_stag(iwdev, stag);
@@ -1863,7 +1864,7 @@ static struct ib_mr *i40iw_get_dma_mr(struct ib_pd *pd, int acc)
  {
         u64 kva = 0;
  
-       return i40iw_reg_phys_mr(pd, 0, 0xffffffffffULL, acc, &kva);
+       return i40iw_reg_phys_mr(pd, 0, 0, acc, &kva);
  }
  
  /**
@@ -1974,18 +1975,6 @@ static ssize_t i40iw_show_rev(struct device *dev,
         return sprintf(buf, "%x\n", hw_rev);
  }
  
-/**
- * i40iw_show_fw_ver
- */
-static ssize_t i40iw_show_fw_ver(struct device *dev,
-                                struct device_attribute *attr, char *buf)
-{
-       u32 firmware_version = I40IW_FW_VERSION;
-
-       return sprintf(buf, "%u.%u\n", firmware_version,
-                      (firmware_version & 0x000000ff));
-}
-
  /**
   * i40iw_show_hca
   */
@@ -2006,13 +1995,11 @@ static ssize_t i40iw_show_board(struct device *dev,
  }
  
  static DEVICE_ATTR(hw_rev, S_IRUGO, i40iw_show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, i40iw_show_fw_ver, NULL);
  static DEVICE_ATTR(hca_type, S_IRUGO, i40iw_show_hca, NULL);
  static DEVICE_ATTR(board_id, S_IRUGO, i40iw_show_board, NULL);
  
  static struct device_attribute *i40iw_dev_attributes[] = {
         &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
         &dev_attr_hca_type,
         &dev_attr_board_id
  };
@@ -2091,8 +2078,12 @@ static int i40iw_post_send(struct ib_qp *ibqp,
                                 ret = ukqp->ops.iw_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false);
                         }
  
-                       if (ret)
-                               err = -EIO;
+                       if (ret) {
+                               if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
+                                       err = -ENOMEM;
+                               else
+                                       err = -EINVAL;
+                       }
                         break;
                 case IB_WR_RDMA_WRITE:
                         info.op_type = I40IW_OP_TYPE_RDMA_WRITE;
@@ -2113,8 +2104,12 @@ static int i40iw_post_send(struct ib_qp *ibqp,
                                 ret = ukqp->ops.iw_rdma_write(ukqp, &info, false);
                         }
  
-                       if (ret)
-                               err = -EIO;
+                       if (ret) {
+                               if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
+                                       err = -ENOMEM;
+                               else
+                                       err = -EINVAL;
+                       }
                         break;
                 case IB_WR_RDMA_READ_WITH_INV:
                         inv_stag = true;
@@ -2132,15 +2127,19 @@ static int i40iw_post_send(struct ib_qp *ibqp,
                         info.op.rdma_read.lo_addr.stag = ib_wr->sg_list->lkey;
                         info.op.rdma_read.lo_addr.len = ib_wr->sg_list->length;
                         ret = ukqp->ops.iw_rdma_read(ukqp, &info, inv_stag, false);
-                       if (ret)
-                               err = -EIO;
+                       if (ret) {
+                               if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
+                                       err = -ENOMEM;
+                               else
+                                       err = -EINVAL;
+                       }
                         break;
                 case IB_WR_LOCAL_INV:
                         info.op_type = I40IW_OP_TYPE_INV_STAG;
                         info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey;
                         ret = ukqp->ops.iw_stag_local_invalidate(ukqp, &info, true);
                         if (ret)
-                               err = -EIO;
+                               err = -ENOMEM;
                         break;
                 case IB_WR_REG_MR:
                 {
@@ -2174,7 +2173,7 @@ static int i40iw_post_send(struct ib_qp *ibqp,
  
                         ret = dev->iw_priv_qp_ops->iw_mr_fast_register(&iwqp->sc_qp, &info, true);
                         if (ret)
-                               err = -EIO;
+                               err = -ENOMEM;
                         break;
                 }
                 default:
@@ -2214,6 +2213,7 @@ static int i40iw_post_recv(struct ib_qp *ibqp,
         struct i40iw_sge sg_list[I40IW_MAX_WQ_FRAGMENT_COUNT];
         enum i40iw_status_code ret = 0;
         unsigned long flags;
+       int err = 0;
  
         iwqp = (struct i40iw_qp *)ibqp;
         ukqp = &iwqp->sc_qp.qp_uk;
@@ -2228,6 +2228,10 @@ static int i40iw_post_recv(struct ib_qp *ibqp,
                 ret = ukqp->ops.iw_post_receive(ukqp, &post_recv);
                 if (ret) {
                         i40iw_pr_err(" post_recv err %d\n", ret);
+                       if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
+                               err = -ENOMEM;
+                       else
+                               err = -EINVAL;
                         *bad_wr = ib_wr;
                         goto out;
                 }
@@ -2235,9 +2239,7 @@ static int i40iw_post_recv(struct ib_qp *ibqp,
         }
   out:
         spin_unlock_irqrestore(&iwqp->lock, flags);
-       if (ret)
-               return -ENOSYS;
-       return 0;
+       return err;
  }
  
  /**
@@ -2264,7 +2266,7 @@ static int i40iw_poll_cq(struct ib_cq *ibcq,
  
         spin_lock_irqsave(&iwcq->lock, flags);
         while (cqe_count < num_entries) {
-               ret = ukcq->ops.iw_cq_poll_completion(ukcq, &cq_poll_info, true);
+               ret = ukcq->ops.iw_cq_poll_completion(ukcq, &cq_poll_info);
                 if (ret == I40IW_ERR_QUEUE_EMPTY) {
                         break;
                 } else if (ret == I40IW_ERR_QUEUE_DESTROYED) {
@@ -2437,6 +2439,15 @@ static const char * const i40iw_hw_stat_names[] = {
                 "iwRdmaInv"
  };
  
+static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str,
+                                size_t str_len)
+{
+       u32 firmware_version = I40IW_FW_VERSION;
+
+       snprintf(str, str_len, "%u.%u", firmware_version,
+                      (firmware_version & 0x000000ff));
+}
+
  /**
   * i40iw_alloc_hw_stats - Allocate a hw stats structure
   * @ibdev: device pointer from stack
@@ -2528,7 +2539,7 @@ static int i40iw_modify_port(struct ib_device *ibdev,
                              int port_modify_mask,
                              struct ib_port_modify *props)
  {
-       return 0;
+       return -ENOSYS;
  }
  
  /**
@@ -2660,6 +2671,7 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
         memcpy(iwibdev->ibdev.iwcm->ifname, netdev->name,
                sizeof(iwibdev->ibdev.iwcm->ifname));
         iwibdev->ibdev.get_port_immutable   = i40iw_port_immutable;
+       iwibdev->ibdev.get_dev_fw_str       = i40iw_get_dev_fw_str;
         iwibdev->ibdev.poll_cq = i40iw_poll_cq;
         iwibdev->ibdev.req_notify_cq = i40iw_req_notify_cq;
         iwibdev->ibdev.post_send = i40iw_post_send;
@@ -2723,7 +2735,7 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev)
  
         iwdev->iwibdev = i40iw_init_rdma_device(iwdev);
         if (!iwdev->iwibdev)
-               return -ENOSYS;
+               return -ENOMEM;
         iwibdev = iwdev->iwibdev;
  
         ret = ib_register_device(&iwibdev->ibdev, NULL);
@@ -2748,5 +2760,5 @@ error:
         kfree(iwdev->iwibdev->ibdev.iwcm);
         iwdev->iwibdev->ibdev.iwcm = NULL;
         ib_dealloc_device(&iwdev->iwibdev->ibdev);
-       return -ENOSYS;
+       return ret;
  }
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c

index 9f8b516eb2b0f2412452685d21df360075245f0c..d6fc8a6e8c3324fcef4d9081167cce9cc998f13b 100644 (file)
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -288,7 +288,7 @@ static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
         if (cq->resize_buf)
                 return -EBUSY;
  
-       cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+       cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL);
         if (!cq->resize_buf)
                 return -ENOMEM;
  
@@ -316,7 +316,7 @@ static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq
         if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
                 return -EFAULT;
  
-       cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+       cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL);
         if (!cq->resize_buf)
                 return -ENOMEM;
  
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c

index 42a46078d7d52755109784e769f29c2e5532bbca..2af44c2de2624a75d90a675727b32f914ec53e05 100644 (file)
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2025,16 +2025,6 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr,
         return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device);
  }
  
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
-                          char *buf)
-{
-       struct mlx4_ib_dev *dev =
-               container_of(device, struct mlx4_ib_dev, ib_dev.dev);
-       return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32),
-                      (int) (dev->dev->caps.fw_ver >> 16) & 0xffff,
-                      (int) dev->dev->caps.fw_ver & 0xffff);
-}
-
  static ssize_t show_rev(struct device *device, struct device_attribute *attr,
                         char *buf)
  {
@@ -2053,17 +2043,204 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr,
  }
  
  static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
-static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
  static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
  static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
  
  static struct device_attribute *mlx4_class_attributes[] = {
         &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
         &dev_attr_hca_type,
         &dev_attr_board_id
  };
  
+struct diag_counter {
+       const char *name;
+       u32 offset;
+};
+
+#define DIAG_COUNTER(_name, _offset)                   \
+       { .name = #_name, .offset = _offset }
+
+static const struct diag_counter diag_basic[] = {
+       DIAG_COUNTER(rq_num_lle, 0x00),
+       DIAG_COUNTER(sq_num_lle, 0x04),
+       DIAG_COUNTER(rq_num_lqpoe, 0x08),
+       DIAG_COUNTER(sq_num_lqpoe, 0x0C),
+       DIAG_COUNTER(rq_num_lpe, 0x18),
+       DIAG_COUNTER(sq_num_lpe, 0x1C),
+       DIAG_COUNTER(rq_num_wrfe, 0x20),
+       DIAG_COUNTER(sq_num_wrfe, 0x24),
+       DIAG_COUNTER(sq_num_mwbe, 0x2C),
+       DIAG_COUNTER(sq_num_bre, 0x34),
+       DIAG_COUNTER(sq_num_rire, 0x44),
+       DIAG_COUNTER(rq_num_rire, 0x48),
+       DIAG_COUNTER(sq_num_rae, 0x4C),
+       DIAG_COUNTER(rq_num_rae, 0x50),
+       DIAG_COUNTER(sq_num_roe, 0x54),
+       DIAG_COUNTER(sq_num_tree, 0x5C),
+       DIAG_COUNTER(sq_num_rree, 0x64),
+       DIAG_COUNTER(rq_num_rnr, 0x68),
+       DIAG_COUNTER(sq_num_rnr, 0x6C),
+       DIAG_COUNTER(rq_num_oos, 0x100),
+       DIAG_COUNTER(sq_num_oos, 0x104),
+};
+
+static const struct diag_counter diag_ext[] = {
+       DIAG_COUNTER(rq_num_dup, 0x130),
+       DIAG_COUNTER(sq_num_to, 0x134),
+};
+
+static const struct diag_counter diag_device_only[] = {
+       DIAG_COUNTER(num_cqovf, 0x1A0),
+       DIAG_COUNTER(rq_num_udsdprd, 0x118),
+};
+
+static struct rdma_hw_stats *mlx4_ib_alloc_hw_stats(struct ib_device *ibdev,
+                                                   u8 port_num)
+{
+       struct mlx4_ib_dev *dev = to_mdev(ibdev);
+       struct mlx4_ib_diag_counters *diag = dev->diag_counters;
+
+       if (!diag[!!port_num].name)
+               return NULL;
+
+       return rdma_alloc_hw_stats_struct(diag[!!port_num].name,
+                                         diag[!!port_num].num_counters,
+                                         RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int mlx4_ib_get_hw_stats(struct ib_device *ibdev,
+                               struct rdma_hw_stats *stats,
+                               u8 port, int index)
+{
+       struct mlx4_ib_dev *dev = to_mdev(ibdev);
+       struct mlx4_ib_diag_counters *diag = dev->diag_counters;
+       u32 hw_value[ARRAY_SIZE(diag_device_only) +
+               ARRAY_SIZE(diag_ext) + ARRAY_SIZE(diag_basic)] = {};
+       int ret;
+       int i;
+
+       ret = mlx4_query_diag_counters(dev->dev,
+                                      MLX4_OP_MOD_QUERY_TRANSPORT_CI_ERRORS,
+                                      diag[!!port].offset, hw_value,
+                                      diag[!!port].num_counters, port);
+
+       if (ret)
+               return ret;
+
+       for (i = 0; i < diag[!!port].num_counters; i++)
+               stats->value[i] = hw_value[i];
+
+       return diag[!!port].num_counters;
+}
+
+static int __mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev,
+                                        const char ***name,
+                                        u32 **offset,
+                                        u32 *num,
+                                        bool port)
+{
+       u32 num_counters;
+
+       num_counters = ARRAY_SIZE(diag_basic);
+
+       if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT)
+               num_counters += ARRAY_SIZE(diag_ext);
+
+       if (!port)
+               num_counters += ARRAY_SIZE(diag_device_only);
+
+       *name = kcalloc(num_counters, sizeof(**name), GFP_KERNEL);
+       if (!*name)
+               return -ENOMEM;
+
+       *offset = kcalloc(num_counters, sizeof(**offset), GFP_KERNEL);
+       if (!*offset)
+               goto err_name;
+
+       *num = num_counters;
+
+       return 0;
+
+err_name:
+       kfree(*name);
+       return -ENOMEM;
+}
+
+static void mlx4_ib_fill_diag_counters(struct mlx4_ib_dev *ibdev,
+                                      const char **name,
+                                      u32 *offset,
+                                      bool port)
+{
+       int i;
+       int j;
+
+       for (i = 0, j = 0; i < ARRAY_SIZE(diag_basic); i++, j++) {
+               name[i] = diag_basic[i].name;
+               offset[i] = diag_basic[i].offset;
+       }
+
+       if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT) {
+               for (i = 0; i < ARRAY_SIZE(diag_ext); i++, j++) {
+                       name[j] = diag_ext[i].name;
+                       offset[j] = diag_ext[i].offset;
+               }
+       }
+
+       if (!port) {
+               for (i = 0; i < ARRAY_SIZE(diag_device_only); i++, j++) {
+                       name[j] = diag_device_only[i].name;
+                       offset[j] = diag_device_only[i].offset;
+               }
+       }
+}
+
+static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev)
+{
+       struct mlx4_ib_diag_counters *diag = ibdev->diag_counters;
+       int i;
+       int ret;
+       bool per_port = !!(ibdev->dev->caps.flags2 &
+               MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT);
+
+       for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) {
+               /* i == 1 means we are building port counters */
+               if (i && !per_port)
+                       continue;
+
+               ret = __mlx4_ib_alloc_diag_counters(ibdev, &diag[i].name,
+                                                   &diag[i].offset,
+                                                   &diag[i].num_counters, i);
+               if (ret)
+                       goto err_alloc;
+
+               mlx4_ib_fill_diag_counters(ibdev, diag[i].name,
+                                          diag[i].offset, i);
+       }
+
+       ibdev->ib_dev.get_hw_stats      = mlx4_ib_get_hw_stats;
+       ibdev->ib_dev.alloc_hw_stats    = mlx4_ib_alloc_hw_stats;
+
+       return 0;
+
+err_alloc:
+       if (i) {
+               kfree(diag[i - 1].name);
+               kfree(diag[i - 1].offset);
+       }
+
+       return ret;
+}
+
+static void mlx4_ib_diag_cleanup(struct mlx4_ib_dev *ibdev)
+{
+       int i;
+
+       for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) {
+               kfree(ibdev->diag_counters[i].offset);
+               kfree(ibdev->diag_counters[i].name);
+       }
+}
+
  #define MLX4_IB_INVALID_MAC    ((u64)-1)
  static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
                                struct net_device *dev,
@@ -2280,6 +2457,17 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num,
         return 0;
  }
  
+static void get_fw_ver_str(struct ib_device *device, char *str,
+                          size_t str_len)
+{
+       struct mlx4_ib_dev *dev =
+               container_of(device, struct mlx4_ib_dev, ib_dev);
+       snprintf(str, str_len, "%d.%d.%d",
+                (int) (dev->dev->caps.fw_ver >> 32),
+                (int) (dev->dev->caps.fw_ver >> 16) & 0xffff,
+                (int) dev->dev->caps.fw_ver & 0xffff);
+}
+
  static void *mlx4_ib_add(struct mlx4_dev *dev)
  {
         struct mlx4_ib_dev *ibdev;
@@ -2413,6 +2601,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
         ibdev->ib_dev.detach_mcast      = mlx4_ib_mcg_detach;
         ibdev->ib_dev.process_mad       = mlx4_ib_process_mad;
         ibdev->ib_dev.get_port_immutable = mlx4_port_immutable;
+       ibdev->ib_dev.get_dev_fw_str    = get_fw_ver_str;
         ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext;
  
         if (!mlx4_is_slave(ibdev->dev)) {
@@ -2555,9 +2744,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
         for (j = 1; j <= ibdev->dev->caps.num_ports; j++)
                 atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]);
  
-       if (ib_register_device(&ibdev->ib_dev, NULL))
+       if (mlx4_ib_alloc_diag_counters(ibdev))
                 goto err_steer_free_bitmap;
  
+       if (ib_register_device(&ibdev->ib_dev, NULL))
+               goto err_diag_counters;
+
         if (mlx4_ib_mad_init(ibdev))
                 goto err_reg;
  
@@ -2623,6 +2815,9 @@ err_mad:
  err_reg:
         ib_unregister_device(&ibdev->ib_dev);
  
+err_diag_counters:
+       mlx4_ib_diag_cleanup(ibdev);
+
  err_steer_free_bitmap:
         kfree(ibdev->ib_uc_qpns_bitmap);
  
@@ -2726,6 +2921,7 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
         mlx4_ib_close_sriov(ibdev);
         mlx4_ib_mad_cleanup(ibdev);
         ib_unregister_device(&ibdev->ib_dev);
+       mlx4_ib_diag_cleanup(ibdev);
         if (ibdev->iboe.nb.notifier_call) {
                 if (unregister_netdevice_notifier(&ibdev->iboe.nb))
                         pr_warn("failure unregistering notifier\n");
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h

index 29acda249612dd444ee42c37494674cc4e482790..7c5832ede4bd0cc213139900eeb3999532192858 100644 (file)
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -549,6 +549,14 @@ struct mlx4_ib_counters {
         u32                     default_counter;
  };
  
+#define MLX4_DIAG_COUNTERS_TYPES 2
+
+struct mlx4_ib_diag_counters {
+       const char **name;
+       u32 *offset;
+       u32 num_counters;
+};
+
  struct mlx4_ib_dev {
         struct ib_device        ib_dev;
         struct mlx4_dev        *dev;
@@ -585,6 +593,7 @@ struct mlx4_ib_dev {
         /* protect resources needed as part of reset flow */
         spinlock_t              reset_flow_resource_lock;
         struct list_head                qp_list;
+       struct mlx4_ib_diag_counters diag_counters[MLX4_DIAG_COUNTERS_TYPES];
  };
  
  struct ib_event_work {
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c

index 9c0e67bd2ba7326cf711e3eba0cf2e6bbd630d3e..308a358e5b46416f42285f0738200f505495bf5c 100644 (file)
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -424,6 +424,83 @@ static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe,
         item->key = be32_to_cpu(cqe->mkey);
  }
  
+static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries,
+                        struct ib_wc *wc, int *npolled)
+{
+       struct mlx5_ib_wq *wq;
+       unsigned int cur;
+       unsigned int idx;
+       int np;
+       int i;
+
+       wq = &qp->sq;
+       cur = wq->head - wq->tail;
+       np = *npolled;
+
+       if (cur == 0)
+               return;
+
+       for (i = 0;  i < cur && np < num_entries; i++) {
+               idx = wq->last_poll & (wq->wqe_cnt - 1);
+               wc->wr_id = wq->wrid[idx];
+               wc->status = IB_WC_WR_FLUSH_ERR;
+               wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+               wq->tail++;
+               np++;
+               wc->qp = &qp->ibqp;
+               wc++;
+               wq->last_poll = wq->w_list[idx].next;
+       }
+       *npolled = np;
+}
+
+static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries,
+                        struct ib_wc *wc, int *npolled)
+{
+       struct mlx5_ib_wq *wq;
+       unsigned int cur;
+       int np;
+       int i;
+
+       wq = &qp->rq;
+       cur = wq->head - wq->tail;
+       np = *npolled;
+
+       if (cur == 0)
+               return;
+
+       for (i = 0;  i < cur && np < num_entries; i++) {
+               wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+               wc->status = IB_WC_WR_FLUSH_ERR;
+               wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+               wq->tail++;
+               np++;
+               wc->qp = &qp->ibqp;
+               wc++;
+       }
+       *npolled = np;
+}
+
+static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries,
+                                struct ib_wc *wc, int *npolled)
+{
+       struct mlx5_ib_qp *qp;
+
+       *npolled = 0;
+       /* Find uncompleted WQEs of that cq and return mimicked completions */
+       list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) {
+               sw_send_comp(qp, num_entries, wc + *npolled, npolled);
+               if (*npolled >= num_entries)
+                       return;
+       }
+
+       list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) {
+               sw_recv_comp(qp, num_entries, wc + *npolled, npolled);
+               if (*npolled >= num_entries)
+                       return;
+       }
+}
+
  static int mlx5_poll_one(struct mlx5_ib_cq *cq,
                          struct mlx5_ib_qp **cur_qp,
                          struct ib_wc *wc)
@@ -594,12 +671,18 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
  {
         struct mlx5_ib_cq *cq = to_mcq(ibcq);
         struct mlx5_ib_qp *cur_qp = NULL;
+       struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+       struct mlx5_core_dev *mdev = dev->mdev;
         unsigned long flags;
         int soft_polled = 0;
         int npolled;
         int err = 0;
  
         spin_lock_irqsave(&cq->lock, flags);
+       if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+               mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled);
+               goto out;
+       }
  
         if (unlikely(!list_empty(&cq->wc_list)))
                 soft_polled = poll_soft_wc(cq, num_entries, wc);
@@ -612,7 +695,7 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
  
         if (npolled)
                 mlx5_cq_set_ci(&cq->mcq);
-
+out:
         spin_unlock_irqrestore(&cq->lock, flags);
  
         if (err == 0 || err == -EAGAIN)
@@ -843,6 +926,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
         cq->resize_buf = NULL;
         cq->resize_umem = NULL;
         cq->create_flags = attr->flags;
+       INIT_LIST_HEAD(&cq->list_send_qp);
+       INIT_LIST_HEAD(&cq->list_recv_qp);
  
         if (context) {
                 err = create_cq_user(dev, udata, context, cq, entries,
diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c

index 53e03c8ede79b16b097e84b0f7b9cb18de66c767..79e6309460dc509777324a9875e46573b7d38070 100644 (file)
--- a/drivers/infiniband/hw/mlx5/gsi.c
+++ b/drivers/infiniband/hw/mlx5/gsi.c
@@ -69,15 +69,6 @@ static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev)
         return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn);
  }
  
-static u32 next_outstanding(struct mlx5_ib_gsi_qp *gsi, u32 index)
-{
-       return ++index % gsi->cap.max_send_wr;
-}
-
-#define for_each_outstanding_wr(gsi, index) \
-       for (index = gsi->outstanding_ci; index != gsi->outstanding_pi; \
-            index = next_outstanding(gsi, index))
-
  /* Call with gsi->lock locked */
  static void generate_completions(struct mlx5_ib_gsi_qp *gsi)
  {
@@ -85,8 +76,9 @@ static void generate_completions(struct mlx5_ib_gsi_qp *gsi)
         struct mlx5_ib_gsi_wr *wr;
         u32 index;
  
-       for_each_outstanding_wr(gsi, index) {
-               wr = &gsi->outstanding_wrs[index];
+       for (index = gsi->outstanding_ci; index != gsi->outstanding_pi;
+            index++) {
+               wr = &gsi->outstanding_wrs[index % gsi->cap.max_send_wr];
  
                 if (!wr->completed)
                         break;
@@ -430,8 +422,9 @@ static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_gsi_qp *gsi,
                 return -ENOMEM;
         }
  
-       gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi];
-       gsi->outstanding_pi = next_outstanding(gsi, gsi->outstanding_pi);
+       gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi %
+                                      gsi->cap.max_send_wr];
+       gsi->outstanding_pi++;
  
         if (!wc) {
                 memset(&gsi_wr->wc, 0, sizeof(gsi_wr->wc));
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c

index dad63f038bb86edd66f806b42e2a62828a6bb085..a84bb766fc62874bc45303268c25b961e45cd4f1 100644 (file)
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -42,11 +42,13 @@
  #include <asm/pat.h>
  #endif
  #include <linux/sched.h>
+#include <linux/delay.h>
  #include <rdma/ib_user_verbs.h>
  #include <rdma/ib_addr.h>
  #include <rdma/ib_cache.h>
  #include <linux/mlx5/port.h>
  #include <linux/mlx5/vport.h>
+#include <linux/list.h>
  #include <rdma/ib_smi.h>
  #include <rdma/ib_umem.h>
  #include <linux/in.h>
@@ -457,8 +459,17 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
         int max_rq_sg;
         int max_sq_sg;
         u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
+       struct mlx5_ib_query_device_resp resp = {};
+       size_t resp_len;
+       u64 max_tso;
  
-       if (uhw->inlen || uhw->outlen)
+       resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
+       if (uhw->outlen && uhw->outlen < resp_len)
+               return -EINVAL;
+       else
+               resp.response_length = resp_len;
+
+       if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
                 return -EINVAL;
  
         memset(props, 0, sizeof(*props));
@@ -511,10 +522,21 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
         if (MLX5_CAP_GEN(mdev, block_lb_mc))
                 props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
  
-       if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
-           (MLX5_CAP_ETH(dev->mdev, csum_cap)))
+       if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) {
+               if (MLX5_CAP_ETH(mdev, csum_cap))
                         props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
  
+               if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
+                       max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
+                       if (max_tso) {
+                               resp.tso_caps.max_tso = 1 << max_tso;
+                               resp.tso_caps.supported_qpts |=
+                                       1 << IB_QPT_RAW_PACKET;
+                               resp.response_length += sizeof(resp.tso_caps);
+                       }
+               }
+       }
+
         if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
                 props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
                 props->device_cap_flags |= IB_DEVICE_UD_TSO;
@@ -576,6 +598,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
         if (!mlx5_core_is_pf(mdev))
                 props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
  
+       if (uhw->outlen) {
+               err = ib_copy_to_udata(uhw, &resp, resp.response_length);
+
+               if (err)
+                       return err;
+       }
+
         return 0;
  }
  
@@ -983,6 +1012,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
                         goto out_uars;
         }
  
+       INIT_LIST_HEAD(&context->vma_private_list);
         INIT_LIST_HEAD(&context->db_page_list);
         mutex_init(&context->db_page_mutex);
  
@@ -992,6 +1022,11 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
         if (field_avail(typeof(resp), cqe_version, udata->outlen))
                 resp.response_length += sizeof(resp.cqe_version);
  
+       if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
+               resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE;
+               resp.response_length += sizeof(resp.cmds_supp_uhw);
+       }
+
         /*
          * We don't want to expose information from the PCI bar that is located
          * after 4096 bytes, so if the arch only supports larger pages, let's
@@ -1006,8 +1041,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
                         offsetof(struct mlx5_init_seg, internal_timer_h) %
                         PAGE_SIZE;
                 resp.response_length += sizeof(resp.hca_core_clock_offset) +
-                                       sizeof(resp.reserved2) +
-                                       sizeof(resp.reserved3);
+                                       sizeof(resp.reserved2);
         }
  
         err = ib_copy_to_udata(udata, &resp, resp.response_length);
@@ -1086,6 +1120,125 @@ static int get_index(unsigned long offset)
         return get_arg(offset);
  }
  
+static void  mlx5_ib_vma_open(struct vm_area_struct *area)
+{
+       /* vma_open is called when a new VMA is created on top of our VMA.  This
+        * is done through either mremap flow or split_vma (usually due to
+        * mlock, madvise, munmap, etc.) We do not support a clone of the VMA,
+        * as this VMA is strongly hardware related.  Therefore we set the
+        * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
+        * calling us again and trying to do incorrect actions.  We assume that
+        * the original VMA size is exactly a single page, and therefore all
+        * "splitting" operation will not happen to it.
+        */
+       area->vm_ops = NULL;
+}
+
+static void  mlx5_ib_vma_close(struct vm_area_struct *area)
+{
+       struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data;
+
+       /* It's guaranteed that all VMAs opened on a FD are closed before the
+        * file itself is closed, therefore no sync is needed with the regular
+        * closing flow. (e.g. mlx5 ib_dealloc_ucontext)
+        * However need a sync with accessing the vma as part of
+        * mlx5_ib_disassociate_ucontext.
+        * The close operation is usually called under mm->mmap_sem except when
+        * process is exiting.
+        * The exiting case is handled explicitly as part of
+        * mlx5_ib_disassociate_ucontext.
+        */
+       mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data;
+
+       /* Set the vma pointer to NULL in the mlx5_ib driver's private data
+        * to protect against a race condition in
+        * mlx5_ib_disassociate_ucontext().
+        */
+       mlx5_ib_vma_priv_data->vma = NULL;
+       list_del(&mlx5_ib_vma_priv_data->list);
+       kfree(mlx5_ib_vma_priv_data);
+}
+
+static const struct vm_operations_struct mlx5_ib_vm_ops = {
+       .open = mlx5_ib_vma_open,
+       .close = mlx5_ib_vma_close
+};
+
+static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
+                               struct mlx5_ib_ucontext *ctx)
+{
+       struct mlx5_ib_vma_private_data *vma_prv;
+       struct list_head *vma_head = &ctx->vma_private_list;
+
+       vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL);
+       if (!vma_prv)
+               return -ENOMEM;
+
+       vma_prv->vma = vma;
+       vma->vm_private_data = vma_prv;
+       vma->vm_ops =  &mlx5_ib_vm_ops;
+
+       list_add(&vma_prv->list, vma_head);
+
+       return 0;
+}
+
+static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
+{
+       int ret;
+       struct vm_area_struct *vma;
+       struct mlx5_ib_vma_private_data *vma_private, *n;
+       struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
+       struct task_struct *owning_process  = NULL;
+       struct mm_struct   *owning_mm       = NULL;
+
+       owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
+       if (!owning_process)
+               return;
+
+       owning_mm = get_task_mm(owning_process);
+       if (!owning_mm) {
+               pr_info("no mm, disassociate ucontext is pending task termination\n");
+               while (1) {
+                       put_task_struct(owning_process);
+                       usleep_range(1000, 2000);
+                       owning_process = get_pid_task(ibcontext->tgid,
+                                                     PIDTYPE_PID);
+                       if (!owning_process ||
+                           owning_process->state == TASK_DEAD) {
+                               pr_info("disassociate ucontext done, task was terminated\n");
+                               /* in case task was dead need to release the
+                                * task struct.
+                                */
+                               if (owning_process)
+                                       put_task_struct(owning_process);
+                               return;
+                       }
+               }
+       }
+
+       /* need to protect from a race on closing the vma as part of
+        * mlx5_ib_vma_close.
+        */
+       down_read(&owning_mm->mmap_sem);
+       list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
+                                list) {
+               vma = vma_private->vma;
+               ret = zap_vma_ptes(vma, vma->vm_start,
+                                  PAGE_SIZE);
+               WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__);
+               /* context going to be destroyed, should
+                * not access ops any more.
+                */
+               vma->vm_ops = NULL;
+               list_del(&vma_private->list);
+               kfree(vma_private);
+       }
+       up_read(&owning_mm->mmap_sem);
+       mmput(owning_mm);
+       put_task_struct(owning_process);
+}
+
  static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
  {
         switch (cmd) {
@@ -1101,8 +1254,10 @@ static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
  }
  
  static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
-                   struct vm_area_struct *vma, struct mlx5_uuar_info *uuari)
+                   struct vm_area_struct *vma,
+                   struct mlx5_ib_ucontext *context)
  {
+       struct mlx5_uuar_info *uuari = &context->uuari;
         int err;
         unsigned long idx;
         phys_addr_t pfn, pa;
@@ -1152,14 +1307,13 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
         mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd),
                     vma->vm_start, &pa);
  
-       return 0;
+       return mlx5_ib_set_vma_data(vma, context);
  }
  
  static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
  {
         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
-       struct mlx5_uuar_info *uuari = &context->uuari;
         unsigned long command;
         phys_addr_t pfn;
  
@@ -1168,7 +1322,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
         case MLX5_IB_MMAP_WC_PAGE:
         case MLX5_IB_MMAP_NC_PAGE:
         case MLX5_IB_MMAP_REGULAR_PAGE:
-               return uar_mmap(dev, command, vma, uuari);
+               return uar_mmap(dev, command, vma, context);
  
         case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
                 return -ENOSYS;
@@ -1331,6 +1485,32 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v,
                        &ib_spec->ipv4.val.dst_ip,
                        sizeof(ib_spec->ipv4.val.dst_ip));
                 break;
+       case IB_FLOW_SPEC_IPV6:
+               if (ib_spec->size != sizeof(ib_spec->ipv6))
+                       return -EINVAL;
+
+               MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
+                        ethertype, 0xffff);
+               MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
+                        ethertype, ETH_P_IPV6);
+
+               memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
+                                   src_ipv4_src_ipv6.ipv6_layout.ipv6),
+                      &ib_spec->ipv6.mask.src_ip,
+                      sizeof(ib_spec->ipv6.mask.src_ip));
+               memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
+                                   src_ipv4_src_ipv6.ipv6_layout.ipv6),
+                      &ib_spec->ipv6.val.src_ip,
+                      sizeof(ib_spec->ipv6.val.src_ip));
+               memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
+                                   dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+                      &ib_spec->ipv6.mask.dst_ip,
+                      sizeof(ib_spec->ipv6.mask.dst_ip));
+               memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
+                                   dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+                      &ib_spec->ipv6.val.dst_ip,
+                      sizeof(ib_spec->ipv6.val.dst_ip));
+               break;
         case IB_FLOW_SPEC_TCP:
                 if (ib_spec->size != sizeof(ib_spec->tcp_udp))
                         return -EINVAL;
@@ -1801,15 +1981,6 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr,
         return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
  }
  
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
-                          char *buf)
-{
-       struct mlx5_ib_dev *dev =
-               container_of(device, struct mlx5_ib_dev, ib_dev.dev);
-       return sprintf(buf, "%d.%d.%04d\n", fw_rev_maj(dev->mdev),
-                      fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
-}
-
  static ssize_t show_rev(struct device *device, struct device_attribute *attr,
                         char *buf)
  {
@@ -1828,7 +1999,6 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr,
  }
  
  static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
-static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
  static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
  static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
  static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
@@ -1836,7 +2006,6 @@ static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
  
  static struct device_attribute *mlx5_class_attributes[] = {
         &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
         &dev_attr_hca_type,
         &dev_attr_board_id,
         &dev_attr_fw_pages,
@@ -1854,6 +2023,65 @@ static void pkey_change_handler(struct work_struct *work)
         mutex_unlock(&ports->devr->mutex);
  }
  
+static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
+{
+       struct mlx5_ib_qp *mqp;
+       struct mlx5_ib_cq *send_mcq, *recv_mcq;
+       struct mlx5_core_cq *mcq;
+       struct list_head cq_armed_list;
+       unsigned long flags_qp;
+       unsigned long flags_cq;
+       unsigned long flags;
+
+       INIT_LIST_HEAD(&cq_armed_list);
+
+       /* Walk the qp list of that ibdev, in sync with create/destroy qp. */
+       spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
+       list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
+               spin_lock_irqsave(&mqp->sq.lock, flags_qp);
+               if (mqp->sq.tail != mqp->sq.head) {
+                       send_mcq = to_mcq(mqp->ibqp.send_cq);
+                       spin_lock_irqsave(&send_mcq->lock, flags_cq);
+                       if (send_mcq->mcq.comp &&
+                           mqp->ibqp.send_cq->comp_handler) {
+                               if (!send_mcq->mcq.reset_notify_added) {
+                                       send_mcq->mcq.reset_notify_added = 1;
+                                       list_add_tail(&send_mcq->mcq.reset_notify,
+                                                     &cq_armed_list);
+                               }
+                       }
+                       spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
+               }
+               spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
+               spin_lock_irqsave(&mqp->rq.lock, flags_qp);
+               /* no handling is needed for SRQ */
+               if (!mqp->ibqp.srq) {
+                       if (mqp->rq.tail != mqp->rq.head) {
+                               recv_mcq = to_mcq(mqp->ibqp.recv_cq);
+                               spin_lock_irqsave(&recv_mcq->lock, flags_cq);
+                               if (recv_mcq->mcq.comp &&
+                                   mqp->ibqp.recv_cq->comp_handler) {
+                                       if (!recv_mcq->mcq.reset_notify_added) {
+                                               recv_mcq->mcq.reset_notify_added = 1;
+                                               list_add_tail(&recv_mcq->mcq.reset_notify,
+                                                             &cq_armed_list);
+                                       }
+                               }
+                               spin_unlock_irqrestore(&recv_mcq->lock,
+                                                      flags_cq);
+                       }
+               }
+               spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
+       }
+       /* At this point all in-flight post sends have been put to execution,
+        * since we locked/unlocked the locks above. Now arm all involved CQs.
+        */
+       list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
+               mcq->comp(mcq);
+       }
+       spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
+}
+
  static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
                           enum mlx5_dev_event event, unsigned long param)
  {
@@ -1866,6 +2094,7 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
         case MLX5_DEV_EVENT_SYS_ERROR:
                 ibdev->ib_active = false;
                 ibev.event = IB_EVENT_DEVICE_FATAL;
+               mlx5_ib_handle_internal_error(ibdev);
                 break;
  
         case MLX5_DEV_EVENT_PORT_UP:
@@ -2272,6 +2501,15 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
         return 0;
  }
  
+/*
+ * Format the firmware revision of the device behind @ibdev into @str
+ * (bounded by @str_len) as "<major>.<minor>.<sub-minor>", with the
+ * sub-minor part zero-padded to four digits.
+ */
+static void get_dev_fw_str(struct ib_device *ibdev, char *str,
+                          size_t str_len)
+{
+       struct mlx5_ib_dev *mibdev;
+       struct mlx5_core_dev *core;
+
+       mibdev = container_of(ibdev, struct mlx5_ib_dev, ib_dev);
+       core = mibdev->mdev;
+
+       snprintf(str, str_len, "%d.%d.%04d",
+                fw_rev_maj(core), fw_rev_min(core), fw_rev_sub(core));
+}
+
  static int mlx5_enable_roce(struct mlx5_ib_dev *dev)
  {
         int err;
@@ -2298,6 +2536,113 @@ static void mlx5_disable_roce(struct mlx5_ib_dev *dev)
         unregister_netdevice_notifier(&dev->roce.nb);
  }
  
+/* Release the queue counter recorded in dev->port[] for every port of @dev. */
+static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
+{
+       unsigned int port_idx = 0;
+
+       while (port_idx < dev->num_ports) {
+               mlx5_core_dealloc_q_counter(dev->mdev,
+                                           dev->port[port_idx].q_cnt_id);
+               port_idx++;
+       }
+}
+
+/*
+ * Allocate one queue counter per port and record its id in
+ * dev->port[].q_cnt_id.
+ *
+ * Returns 0 on success. If an allocation fails, a warning naming the
+ * 1-based port is logged, the counters obtained so far are released again
+ * and the error from mlx5_core_alloc_q_counter() is returned.
+ */
+static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
+{
+       int port_idx;
+       int err;
+
+       for (port_idx = 0; port_idx < dev->num_ports; port_idx++) {
+               err = mlx5_core_alloc_q_counter(dev->mdev,
+                                               &dev->port[port_idx].q_cnt_id);
+               if (!err)
+                       continue;
+
+               mlx5_ib_warn(dev,
+                            "couldn't allocate queue counter for port %d, err %d\n",
+                            port_idx + 1, err);
+
+               /* Unwind the ports that already own a counter. */
+               for (port_idx--; port_idx >= 0; port_idx--)
+                       mlx5_core_dealloc_q_counter(dev->mdev,
+                                                   dev->port[port_idx].q_cnt_id);
+
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Names of the per-port hardware counters, handed to
+ * rdma_alloc_hw_stats_struct(). Entry i is filled from the
+ * query_q_counter_out field located at stats_offsets[i], so the two tables
+ * must keep the same order and length (checked with BUILD_BUG_ON in
+ * mlx5_ib_alloc_hw_stats()).
+ */
+static const char * const names[] = {
+       "rx_write_requests",
+       "rx_read_requests",
+       "rx_atomic_requests",
+       "out_of_buffer",
+       "out_of_sequence",
+       "duplicate_request",
+       "rnr_nak_retry_err",
+       "packet_seq_err",
+       "implied_nak_seq_err",
+       "local_ack_timeout_err",
+};
+
+/*
+ * Byte offset of each counter inside the query_q_counter_out layout, listed
+ * in the same order as names[].
+ */
+static const size_t stats_offsets[] = {
+       MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests),
+       MLX5_BYTE_OFF(query_q_counter_out, rx_read_requests),
+       MLX5_BYTE_OFF(query_q_counter_out, rx_atomic_requests),
+       MLX5_BYTE_OFF(query_q_counter_out, out_of_buffer),
+       MLX5_BYTE_OFF(query_q_counter_out, out_of_sequence),
+       MLX5_BYTE_OFF(query_q_counter_out, duplicate_request),
+       MLX5_BYTE_OFF(query_q_counter_out, rnr_nak_retry_err),
+       MLX5_BYTE_OFF(query_q_counter_out, packet_seq_err),
+       MLX5_BYTE_OFF(query_q_counter_out, implied_nak_seq_err),
+       MLX5_BYTE_OFF(query_q_counter_out, local_ack_timeout_err),
+};
+
+/*
+ * Allocate the hw stats structure describing the names[] counters for
+ * @port_num. Only per-port statistics are supported, so a port number of 0
+ * yields NULL.
+ */
+static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
+                                                   u8 port_num)
+{
+       /* names[] and stats_offsets[] are walked with a shared index. */
+       BUILD_BUG_ON(ARRAY_SIZE(names) != ARRAY_SIZE(stats_offsets));
+
+       return port_num ?
+               rdma_alloc_hw_stats_struct(names, ARRAY_SIZE(names),
+                                          RDMA_HW_STATS_DEFAULT_LIFESPAN) :
+               NULL;
+}
+
+/*
+ * Query the queue counter of @port and store each value in stats->value[],
+ * in names[] order.
+ *
+ * Returns the number of counters written on success, -ENOSYS when @port is
+ * 0 or @stats is NULL, -ENOMEM when the command output buffer cannot be
+ * allocated, or the error reported by mlx5_core_query_q_counter(). @index
+ * is not used.
+ */
+static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
+                               struct rdma_hw_stats *stats,
+                               u8 port, int index)
+{
+       struct mlx5_ib_dev *dev = to_mdev(ibdev);
+       int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
+       void *out;
+       __be32 val;
+       int ret;
+       int i;
+
+       if (!port || !stats)
+               return -ENOSYS;
+
+       out = mlx5_vzalloc(outlen);
+       if (!out)
+               return -ENOMEM;
+
+       /* dev->port[] is 0-based while @port is 1-based. */
+       ret = mlx5_core_query_q_counter(dev->mdev,
+                                       dev->port[port - 1].q_cnt_id, 0,
+                                       out, outlen);
+       if (ret)
+               goto free;
+
+       /* Each counter is read as a big-endian 32-bit field of the output. */
+       for (i = 0; i < ARRAY_SIZE(names); i++) {
+               val = *(__be32 *)(out + stats_offsets[i]);
+               stats->value[i] = (u64)be32_to_cpu(val);
+       }
+
+       /* Only a successful query reports the number of updated counters;
+        * a failed one must hand its error back instead of claiming that
+        * stats->value[] was refreshed.
+        */
+       ret = ARRAY_SIZE(names);
+free:
+       kvfree(out);
+       return ret;
+}
+
  static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
  {
         struct mlx5_ib_dev *dev;
@@ -2320,10 +2665,15 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
  
         dev->mdev = mdev;
  
+       dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port),
+                           GFP_KERNEL);
+       if (!dev->port)
+               goto err_dealloc;
+
         rwlock_init(&dev->roce.netdev_lock);
         err = get_port_caps(dev);
         if (err)
-               goto err_dealloc;
+               goto err_free_port;
  
         if (mlx5_use_mad_ifc(dev))
                 get_ext_port_caps(dev);
@@ -2418,6 +2768,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
         dev->ib_dev.map_mr_sg           = mlx5_ib_map_mr_sg;
         dev->ib_dev.check_mr_status     = mlx5_ib_check_mr_status;
         dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
+       dev->ib_dev.get_dev_fw_str      = get_dev_fw_str;
         if (mlx5_core_is_pf(mdev)) {
                 dev->ib_dev.get_vf_config       = mlx5_ib_get_vf_config;
                 dev->ib_dev.set_vf_link_state   = mlx5_ib_set_vf_link_state;
@@ -2425,6 +2776,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
                 dev->ib_dev.set_vf_guid         = mlx5_ib_set_vf_guid;
         }
  
+       dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext;
+
         mlx5_ib_internal_fill_odp_caps(dev);
  
         if (MLX5_CAP_GEN(mdev, imaicl)) {
@@ -2435,6 +2788,12 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
                         (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
         }
  
+       if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) &&
+           MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
+               dev->ib_dev.get_hw_stats        = mlx5_ib_get_hw_stats;
+               dev->ib_dev.alloc_hw_stats      = mlx5_ib_alloc_hw_stats;
+       }
+
         if (MLX5_CAP_GEN(mdev, xrc)) {
                 dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
                 dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
@@ -2447,9 +2806,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
             IB_LINK_LAYER_ETHERNET) {
                 dev->ib_dev.create_flow = mlx5_ib_create_flow;
                 dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
+               dev->ib_dev.create_wq    = mlx5_ib_create_wq;
+               dev->ib_dev.modify_wq    = mlx5_ib_modify_wq;
+               dev->ib_dev.destroy_wq   = mlx5_ib_destroy_wq;
+               dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table;
+               dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table;
                 dev->ib_dev.uverbs_ex_cmd_mask |=
                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
-                       (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
+                       (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) |
+                       (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
+                       (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
+                       (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
+                       (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
+                       (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
         }
         err = init_node_data(dev);
         if (err)
@@ -2457,6 +2826,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
  
         mutex_init(&dev->flow_db.lock);
         mutex_init(&dev->cap_mask_mutex);
+       INIT_LIST_HEAD(&dev->qp_list);
+       spin_lock_init(&dev->reset_flow_resource_lock);
  
         if (ll == IB_LINK_LAYER_ETHERNET) {
                 err = mlx5_enable_roce(dev);
@@ -2472,10 +2843,14 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
         if (err)
                 goto err_rsrc;
  
-       err = ib_register_device(&dev->ib_dev, NULL);
+       err = mlx5_ib_alloc_q_counters(dev);
         if (err)
                 goto err_odp;
  
+       err = ib_register_device(&dev->ib_dev, NULL);
+       if (err)
+               goto err_q_cnt;
+
         err = create_umr_res(dev);
         if (err)
                 goto err_dev;
@@ -2497,6 +2872,9 @@ err_umrc:
  err_dev:
         ib_unregister_device(&dev->ib_dev);
  
+err_q_cnt:
+       mlx5_ib_dealloc_q_counters(dev);
+
  err_odp:
         mlx5_ib_odp_remove_one(dev);
  
@@ -2507,6 +2885,9 @@ err_disable_roce:
         if (ll == IB_LINK_LAYER_ETHERNET)
                 mlx5_disable_roce(dev);
  
+err_free_port:
+       kfree(dev->port);
+
  err_dealloc:
         ib_dealloc_device((struct ib_device *)dev);
  
@@ -2519,11 +2900,13 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
  
         ib_unregister_device(&dev->ib_dev);
+       mlx5_ib_dealloc_q_counters(dev);
         destroy_umrc_res(dev);
         mlx5_ib_odp_remove_one(dev);
         destroy_dev_resources(&dev->devr);
         if (ll == IB_LINK_LAYER_ETHERNET)
                 mlx5_disable_roce(dev);
+       kfree(dev->port);
         ib_dealloc_device(&dev->ib_dev);
  }
  
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h

index c4a9825828bcfa4fe7144a515c797ec279832e3e..372385d0f99384785123ba5e24fcb2ceddcc520e 100644 (file)
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -105,6 +105,11 @@ enum {
         MLX5_CQE_VERSION_V1,
  };
  
+/*
+ * Pairs a vm_area_struct pointer with a list node.
+ * NOTE(review): presumably linked on mlx5_ib_ucontext.vma_private_list so
+ * user mappings can be found again later - confirm against the mmap code.
+ */
+struct mlx5_ib_vma_private_data {
+       struct list_head list;
+       struct vm_area_struct *vma;
+};
+
  struct mlx5_ib_ucontext {
         struct ib_ucontext      ibucontext;
         struct list_head        db_page_list;
@@ -116,6 +121,7 @@ struct mlx5_ib_ucontext {
         u8                      cqe_version;
         /* Transport Domain number */
         u32                     tdn;
+       struct list_head        vma_private_list;
  };
  
  static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
@@ -217,12 +223,41 @@ struct mlx5_ib_wq {
         void                   *qend;
  };
  
+/*
+ * mlx5 receive work queue; embeds the core ib_wq (see to_mrwq()).
+ *
+ * create_user_rq() fills in: umem (pinned user buffer of buf_size bytes),
+ * rq_page_offset, rq_num_pas, page_shift, log_page_size (page_shift
+ * relative to MLX5_ADAPTER_PAGE_SHIFT), wq_sig (set from
+ * MLX5_WQ_FLAG_SIGNATURE), db (user doorbell mapping) and create_type.
+ * NOTE(review): the remaining fields are set by code outside this view.
+ */
+struct mlx5_ib_rwq {
+       struct ib_wq            ibwq;
+       u32                     rqn;
+       u32                     rq_num_pas;
+       u32                     log_rq_stride;
+       u32                     log_rq_size;
+       u32                     rq_page_offset;
+       u32                     log_page_size;
+       struct ib_umem          *umem;
+       size_t                  buf_size;
+       unsigned int            page_shift;
+       int                     create_type;
+       struct mlx5_db          db;
+       u32                     user_index;
+       u32                     wqe_count;
+       u32                     wqe_shift;
+       int                     wq_sig;
+};
+
  enum {
         MLX5_QP_USER,
         MLX5_QP_KERNEL,
         MLX5_QP_EMPTY
  };
  
+/*
+ * Values for mlx5_ib_rwq.create_type; create_user_rq() stores
+ * MLX5_WQ_USER there.
+ */
+enum {
+       MLX5_WQ_USER,
+       MLX5_WQ_KERNEL
+};
+
+/*
+ * mlx5 wrapper around the core ib_rwq_ind_table (see to_mrwq_ind_table()).
+ * NOTE(review): rqtn is presumably the device-assigned indirection table
+ * number - confirm against mlx5_ib_create_rwq_ind_table().
+ */
+struct mlx5_ib_rwq_ind_table {
+       struct ib_rwq_ind_table ib_rwq_ind_tbl;
+       u32                     rqtn;
+};
+
  /*
   * Connect-IB can trigger up to four concurrent pagefaults
   * per-QP.
@@ -266,6 +301,10 @@ struct mlx5_ib_qp_trans {
         u8                      resp_depth;
  };
  
+/*
+ * Per-QP state of an RSS raw packet QP: tirn is written by
+ * mlx5_core_create_tir() in create_rss_raw_qp_tir() and passed to
+ * mlx5_core_destroy_tir() in destroy_rss_raw_qp_tir().
+ */
+struct mlx5_ib_rss_qp {
+       u32     tirn;
+};
+
  struct mlx5_ib_rq {
         struct mlx5_ib_qp_base base;
         struct mlx5_ib_wq       *rq;
@@ -294,6 +333,7 @@ struct mlx5_ib_qp {
         union {
                 struct mlx5_ib_qp_trans trans_qp;
                 struct mlx5_ib_raw_packet_qp raw_packet_qp;
+               struct mlx5_ib_rss_qp rss_qp;
         };
         struct mlx5_buf         buf;
  
@@ -340,6 +380,9 @@ struct mlx5_ib_qp {
         spinlock_t              disable_page_faults_lock;
         struct mlx5_ib_pfault   pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
  #endif
+       struct list_head        qps_list;
+       struct list_head        cq_recv_list;
+       struct list_head        cq_send_list;
  };
  
  struct mlx5_ib_cq_buf {
@@ -401,6 +444,8 @@ struct mlx5_ib_cq {
         struct mlx5_ib_cq_buf  *resize_buf;
         struct ib_umem         *resize_umem;
         int                     cqe_size;
+       struct list_head        list_send_qp;
+       struct list_head        list_recv_qp;
         u32                     create_flags;
         struct list_head        wc_list;
         enum ib_cq_notify_flags notify_flags;
@@ -546,6 +591,10 @@ struct mlx5_ib_resources {
         struct mutex    mutex;
  };
  
+/*
+ * Per-port driver state: q_cnt_id is the queue counter id obtained from
+ * mlx5_core_alloc_q_counter() and used when querying or releasing it.
+ */
+struct mlx5_ib_port {
+       u16 q_cnt_id;
+};
+
  struct mlx5_roce {
         /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL
          * netdev pointer
@@ -581,6 +630,11 @@ struct mlx5_ib_dev {
         struct srcu_struct      mr_srcu;
  #endif
         struct mlx5_ib_flow_db  flow_db;
+       /* protect resources needed as part of reset flow */
+       spinlock_t              reset_flow_resource_lock;
+       struct list_head        qp_list;
+       /* Array with num_ports elements */
+       struct mlx5_ib_port     *port;
  };
  
  static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -628,6 +682,16 @@ static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp)
         return container_of(ibqp, struct mlx5_ib_qp, ibqp);
  }
  
+/* Map a core ib_wq back to the mlx5_ib_rwq that embeds it. */
+static inline struct mlx5_ib_rwq *to_mrwq(struct ib_wq *ibwq)
+{
+       struct mlx5_ib_rwq *rwq = container_of(ibwq, struct mlx5_ib_rwq, ibwq);
+
+       return rwq;
+}
+
+/* Map a core ib_rwq_ind_table back to the mlx5 wrapper that embeds it. */
+static inline struct mlx5_ib_rwq_ind_table *
+to_mrwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl)
+{
+       return container_of(ib_rwq_ind_tbl, struct mlx5_ib_rwq_ind_table,
+                           ib_rwq_ind_tbl);
+}
+
  static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq)
  {
         return container_of(msrq, struct mlx5_ib_srq, msrq);
@@ -762,6 +826,16 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
  int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift);
  int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
                             struct ib_mr_status *mr_status);
+struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
+                               struct ib_wq_init_attr *init_attr,
+                               struct ib_udata *udata);
+int mlx5_ib_destroy_wq(struct ib_wq *wq);
+int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+                     u32 wq_attr_mask, struct ib_udata *udata);
+struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
+                                                     struct ib_rwq_ind_table_init_attr *init_attr,
+                                                     struct ib_udata *udata);
+int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
  
  #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
  extern struct workqueue_struct *mlx5_ib_page_fault_wq;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c

index 8cf2ce50511f935e6043bea4ebaa4af3a980661c..4b021305c321bbf3513743d34d9c9f1e2e164210 100644 (file)
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1193,12 +1193,16 @@ error:
  
  static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
  {
+       struct mlx5_core_dev *mdev = dev->mdev;
         struct umr_common *umrc = &dev->umrc;
         struct mlx5_ib_umr_context umr_context;
         struct mlx5_umr_wr umrwr = {};
         struct ib_send_wr *bad;
         int err;
  
+       if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+               return 0;
+
         mlx5_ib_init_umr_context(&umr_context);
  
         umrwr.wr.wr_cqe = &umr_context.cqe;
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c

index ce0a7ab35a227c569deae7f6b322b4c84eb5dcec..0dd7d93cac95b9e2bba86749b18f83846fa36245 100644 (file)
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -77,6 +77,10 @@ struct mlx5_wqe_eth_pad {
         u8 rsvd0[16];
  };
  
+static void get_cqs(enum ib_qp_type qp_type,
+                   struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq,
+                   struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq);
+
  static int is_qp0(enum ib_qp_type qp_type)
  {
         return qp_type == IB_QPT_SMI;
@@ -609,6 +613,11 @@ static int to_mlx5_st(enum ib_qp_type type)
         }
  }
  
+static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq,
+                            struct mlx5_ib_cq *recv_cq);
+static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq,
+                              struct mlx5_ib_cq *recv_cq);
+
  static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn)
  {
         return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index;
@@ -649,6 +658,71 @@ err_umem:
         return err;
  }
  
+/*
+ * Tear down the user-space resources of @rwq: unmap its doorbell record
+ * from the ucontext that owns @pd and, if a umem is attached, release it.
+ */
+static void destroy_user_rq(struct ib_pd *pd, struct mlx5_ib_rwq *rwq)
+{
+       struct mlx5_ib_ucontext *context = to_mucontext(pd->uobject->context);
+
+       mlx5_ib_db_unmap_user(context, &rwq->db);
+       if (!rwq->umem)
+               return;
+
+       ib_umem_release(rwq->umem);
+}
+
+/*
+ * Pin the user-space receive queue buffer described by @ucmd, derive the
+ * page geometry stored in @rwq and map the user doorbell record.
+ *
+ * Returns 0 on success, -EINVAL when @ucmd carries no buffer address, or
+ * the error from ib_umem_get(), mlx5_ib_get_buf_offset() or
+ * mlx5_ib_db_map_user(). Once the umem has been obtained it is released
+ * again on every failure path.
+ */
+static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+                         struct mlx5_ib_rwq *rwq,
+                         struct mlx5_ib_create_wq *ucmd)
+{
+       struct mlx5_ib_ucontext *context;
+       int page_shift = 0;
+       int npages;
+       int ncont = 0;
+       int err;
+
+       if (!ucmd->buf_addr)
+               return -EINVAL;
+
+       context = to_mucontext(pd->uobject->context);
+       rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr,
+                              rwq->buf_size, 0, 0);
+       if (IS_ERR(rwq->umem)) {
+               mlx5_ib_dbg(dev, "umem_get failed\n");
+               err = PTR_ERR(rwq->umem);
+               return err;
+       }
+
+       mlx5_ib_cont_pages(rwq->umem, ucmd->buf_addr, &npages, &page_shift,
+                          &ncont, NULL);
+       err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift,
+                                    &rwq->rq_page_offset);
+       if (err) {
+               mlx5_ib_warn(dev, "bad offset\n");
+               goto err_umem;
+       }
+
+       rwq->rq_num_pas = ncont;
+       rwq->page_shift = page_shift;
+       rwq->log_page_size =  page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+       rwq->wq_sig = !!(ucmd->flags & MLX5_WQ_FLAG_SIGNATURE);
+
+       /* Report the offset that was actually computed above rather than a
+        * never-assigned local.
+        */
+       mlx5_ib_dbg(dev, "addr 0x%llx, size %zd, npages %d, page_shift %d, ncont %d, offset %u\n",
+                   (unsigned long long)ucmd->buf_addr, rwq->buf_size,
+                   npages, page_shift, ncont, rwq->rq_page_offset);
+
+       err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db);
+       if (err) {
+               mlx5_ib_dbg(dev, "map failed\n");
+               goto err_umem;
+       }
+
+       rwq->create_type = MLX5_WQ_USER;
+       return 0;
+
+err_umem:
+       ib_umem_release(rwq->umem);
+       return err;
+}
+
  static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                           struct mlx5_ib_qp *qp, struct ib_udata *udata,
                           struct ib_qp_init_attr *attr,
@@ -1201,6 +1275,187 @@ static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp,
         rq->doorbell = &qp->db;
  }
  
+/* Destroy the TIR whose number is stored in qp->rss_qp.tirn. */
+static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
+{
+       u32 tirn = qp->rss_qp.tirn;
+
+       mlx5_core_destroy_tir(dev->mdev, tirn);
+}
+
+/*
+ * Create the indirect TIR backing an RSS raw packet QP.
+ *
+ * The user command is validated (size, trailing bytes cleared, comp_mask
+ * and reserved fields zero), a zeroed response is copied back, and a TIR
+ * is built that points at init_attr->rwq_ind_tbl and hashes with the
+ * Toeplitz function over the fields selected in rx_hash_fields_mask. IPv4
+ * and IPv6 selectors are mutually exclusive, as are TCP and UDP ones. An
+ * empty mask is only accepted when the indirection table has
+ * log_ind_tbl_size == 0.
+ *
+ * Returns 0 on success with the TIR number stored in qp->rss_qp.tirn,
+ * otherwise a negative errno.
+ */
+static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+                                struct ib_pd *pd,
+                                struct ib_qp_init_attr *init_attr,
+                                struct ib_udata *udata)
+{
+       u32 tdn = to_mucontext(pd->uobject->context)->tdn;
+       struct mlx5_ib_create_qp_resp resp = {};
+       struct mlx5_ib_create_qp_rss ucmd = {};
+       size_t required_cmd_sz;
+       size_t min_resp_len;
+       size_t key_len;
+       u32 selected_fields = 0;
+       bool hash_ipv4;
+       bool hash_ipv6;
+       bool hash_tcp;
+       bool hash_udp;
+       void *rss_key;
+       void *tirc;
+       void *hfso;
+       u32 *in;
+       int inlen;
+       int err;
+
+       if (init_attr->qp_type != IB_QPT_RAW_PACKET)
+               return -EOPNOTSUPP;
+
+       if (init_attr->create_flags || init_attr->send_cq)
+               return -EINVAL;
+
+       min_resp_len = offsetof(typeof(resp), uuar_index) +
+                      sizeof(resp.uuar_index);
+       if (udata->outlen < min_resp_len)
+               return -EINVAL;
+
+       required_cmd_sz = offsetof(typeof(ucmd), reserved1) +
+                         sizeof(ucmd.reserved1);
+       if (udata->inlen < required_cmd_sz) {
+               mlx5_ib_dbg(dev, "invalid inlen\n");
+               return -EINVAL;
+       }
+
+       /* Bytes beyond the command layout known here must all be zero. */
+       if (udata->inlen > sizeof(ucmd) &&
+           !ib_is_udata_cleared(udata, sizeof(ucmd),
+                                udata->inlen - sizeof(ucmd))) {
+               mlx5_ib_dbg(dev, "inlen is not supported\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
+               mlx5_ib_dbg(dev, "copy failed\n");
+               return -EFAULT;
+       }
+
+       if (ucmd.comp_mask) {
+               mlx5_ib_dbg(dev, "invalid comp mask\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved)) || ucmd.reserved1) {
+               mlx5_ib_dbg(dev, "invalid reserved\n");
+               return -EOPNOTSUPP;
+       }
+
+       err = ib_copy_to_udata(udata, &resp, min_resp_len);
+       if (err) {
+               mlx5_ib_dbg(dev, "copy failed\n");
+               return -EINVAL;
+       }
+
+       inlen = MLX5_ST_SZ_BYTES(create_tir_in);
+       in = mlx5_vzalloc(inlen);
+       if (!in)
+               return -ENOMEM;
+
+       tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+       MLX5_SET(tirc, tirc, disp_type,
+                MLX5_TIRC_DISP_TYPE_INDIRECT);
+       MLX5_SET(tirc, tirc, indirect_table,
+                init_attr->rwq_ind_tbl->ind_tbl_num);
+       MLX5_SET(tirc, tirc, transport_domain, tdn);
+
+       hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
+
+       /* Toeplitz is the only hash function accepted. */
+       if (ucmd.rx_hash_function != MLX5_RX_HASH_FUNC_TOEPLITZ) {
+               err = -EOPNOTSUPP;
+               goto err;
+       }
+
+       rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
+       key_len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key);
+       if (key_len != ucmd.rx_key_len) {
+               err = -EINVAL;
+               goto err;
+       }
+
+       MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
+       MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
+       memcpy(rss_key, ucmd.rx_hash_key, key_len);
+
+       if (!ucmd.rx_hash_fields_mask) {
+               /* special case when this TIR serves as steering entry without hashing */
+               if (!init_attr->rwq_ind_tbl->log_ind_tbl_size)
+                       goto create_tir;
+               err = -EINVAL;
+               goto err;
+       }
+
+       hash_ipv4 = !!(ucmd.rx_hash_fields_mask &
+                      (MLX5_RX_HASH_SRC_IPV4 | MLX5_RX_HASH_DST_IPV4));
+       hash_ipv6 = !!(ucmd.rx_hash_fields_mask &
+                      (MLX5_RX_HASH_SRC_IPV6 | MLX5_RX_HASH_DST_IPV6));
+       hash_tcp = !!(ucmd.rx_hash_fields_mask &
+                     (MLX5_RX_HASH_SRC_PORT_TCP | MLX5_RX_HASH_DST_PORT_TCP));
+       hash_udp = !!(ucmd.rx_hash_fields_mask &
+                     (MLX5_RX_HASH_SRC_PORT_UDP | MLX5_RX_HASH_DST_PORT_UDP));
+
+       /* Hashing on IPv4 and IPv6 addresses at once is rejected. */
+       if (hash_ipv4 && hash_ipv6) {
+               err = -EINVAL;
+               goto err;
+       }
+
+       /* If none of IPV4 & IPV6 SRC/DST was set - this bit field is ignored */
+       if (hash_ipv4)
+               MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+                        MLX5_L3_PROT_TYPE_IPV4);
+       else if (hash_ipv6)
+               MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+                        MLX5_L3_PROT_TYPE_IPV6);
+
+       /* Hashing on TCP and UDP ports at once is rejected. */
+       if (hash_tcp && hash_udp) {
+               err = -EINVAL;
+               goto err;
+       }
+
+       /* If none of TCP & UDP SRC/DST was set - this bit field is ignored */
+       if (hash_tcp)
+               MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+                        MLX5_L4_PROT_TYPE_TCP);
+       else if (hash_udp)
+               MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+                        MLX5_L4_PROT_TYPE_UDP);
+
+       if (ucmd.rx_hash_fields_mask &
+           (MLX5_RX_HASH_SRC_IPV4 | MLX5_RX_HASH_SRC_IPV6))
+               selected_fields |= MLX5_HASH_FIELD_SEL_SRC_IP;
+
+       if (ucmd.rx_hash_fields_mask &
+           (MLX5_RX_HASH_DST_IPV4 | MLX5_RX_HASH_DST_IPV6))
+               selected_fields |= MLX5_HASH_FIELD_SEL_DST_IP;
+
+       if (ucmd.rx_hash_fields_mask &
+           (MLX5_RX_HASH_SRC_PORT_TCP | MLX5_RX_HASH_SRC_PORT_UDP))
+               selected_fields |= MLX5_HASH_FIELD_SEL_L4_SPORT;
+
+       if (ucmd.rx_hash_fields_mask &
+           (MLX5_RX_HASH_DST_PORT_TCP | MLX5_RX_HASH_DST_PORT_UDP))
+               selected_fields |= MLX5_HASH_FIELD_SEL_L4_DPORT;
+
+       MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields);
+
+create_tir:
+       err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn);
+       if (!err) {
+               /* qpn is reserved for that QP */
+               qp->trans_qp.base.mqp.qpn = 0;
+       }
+
+err:
+       /* Shared exit: the command buffer is freed on success and failure. */
+       kvfree(in);
+       return err;
+}
+
  static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                             struct ib_qp_init_attr *init_attr,
                             struct ib_udata *udata, struct mlx5_ib_qp *qp)
@@ -1211,6 +1466,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
         struct mlx5_ib_create_qp_resp resp;
         struct mlx5_create_qp_mbox_in *in;
         struct mlx5_ib_create_qp ucmd;
+       struct mlx5_ib_cq *send_cq;
+       struct mlx5_ib_cq *recv_cq;
+       unsigned long flags;
         int inlen = sizeof(*in);
         int err;
         u32 uidx = MLX5_IB_DEFAULT_UIDX;
@@ -1227,6 +1485,14 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
         spin_lock_init(&qp->sq.lock);
         spin_lock_init(&qp->rq.lock);
  
+       if (init_attr->rwq_ind_tbl) {
+               if (!udata)
+                       return -ENOSYS;
+
+               err = create_rss_raw_qp_tir(dev, qp, pd, init_attr, udata);
+               return err;
+       }
+
         if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
                 if (!MLX5_CAP_GEN(mdev, block_lb_mc)) {
                         mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n");
@@ -1460,6 +1726,23 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
         base->container_mibqp = qp;
         base->mqp.event = mlx5_ib_qp_event;
  
+       get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq,
+               &send_cq, &recv_cq);
+       spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+       mlx5_ib_lock_cqs(send_cq, recv_cq);
+       /* Maintain device to QPs access, needed for further handling via reset
+        * flow
+        */
+       list_add_tail(&qp->qps_list, &dev->qp_list);
+       /* Maintain CQ to QPs access, needed for further handling via reset flow
+        */
+       if (send_cq)
+               list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp);
+       if (recv_cq)
+               list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp);
+       mlx5_ib_unlock_cqs(send_cq, recv_cq);
+       spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
+
         return 0;
  
  err_create:
@@ -1478,23 +1761,23 @@ static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv
         if (send_cq) {
                 if (recv_cq) {
                         if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
-                               spin_lock_irq(&send_cq->lock);
+                               spin_lock(&send_cq->lock);
                                 spin_lock_nested(&recv_cq->lock,
                                                  SINGLE_DEPTH_NESTING);
                         } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
-                               spin_lock_irq(&send_cq->lock);
+                               spin_lock(&send_cq->lock);
                                 __acquire(&recv_cq->lock);
                         } else {
-                               spin_lock_irq(&recv_cq->lock);
+                               spin_lock(&recv_cq->lock);
                                 spin_lock_nested(&send_cq->lock,
                                                  SINGLE_DEPTH_NESTING);
                         }
                 } else {
-                       spin_lock_irq(&send_cq->lock);
+                       spin_lock(&send_cq->lock);
                         __acquire(&recv_cq->lock);
                 }
         } else if (recv_cq) {
-               spin_lock_irq(&recv_cq->lock);
+               spin_lock(&recv_cq->lock);
                 __acquire(&send_cq->lock);
         } else {
                 __acquire(&send_cq->lock);
@@ -1509,21 +1792,21 @@ static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *re
                 if (recv_cq) {
                         if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
                                 spin_unlock(&recv_cq->lock);
-                               spin_unlock_irq(&send_cq->lock);
+                               spin_unlock(&send_cq->lock);
                         } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
                                 __release(&recv_cq->lock);
-                               spin_unlock_irq(&send_cq->lock);
+                               spin_unlock(&send_cq->lock);
                         } else {
                                 spin_unlock(&send_cq->lock);
-                               spin_unlock_irq(&recv_cq->lock);
+                               spin_unlock(&recv_cq->lock);
                         }
                 } else {
                         __release(&recv_cq->lock);
-                       spin_unlock_irq(&send_cq->lock);
+                       spin_unlock(&send_cq->lock);
                 }
         } else if (recv_cq) {
                 __release(&send_cq->lock);
-               spin_unlock_irq(&recv_cq->lock);
+               spin_unlock(&recv_cq->lock);
         } else {
                 __release(&recv_cq->lock);
                 __release(&send_cq->lock);
@@ -1535,17 +1818,18 @@ static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp)
         return to_mpd(qp->ibqp.pd);
  }
  
-static void get_cqs(struct mlx5_ib_qp *qp,
+static void get_cqs(enum ib_qp_type qp_type,
+                   struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq,
                     struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq)
  {
-       switch (qp->ibqp.qp_type) {
+       switch (qp_type) {
         case IB_QPT_XRC_TGT:
                 *send_cq = NULL;
                 *recv_cq = NULL;
                 break;
         case MLX5_IB_QPT_REG_UMR:
         case IB_QPT_XRC_INI:
-               *send_cq = to_mcq(qp->ibqp.send_cq);
+               *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL;
                 *recv_cq = NULL;
                 break;
  
@@ -1557,8 +1841,8 @@ static void get_cqs(struct mlx5_ib_qp *qp,
         case IB_QPT_RAW_IPV6:
         case IB_QPT_RAW_ETHERTYPE:
         case IB_QPT_RAW_PACKET:
-               *send_cq = to_mcq(qp->ibqp.send_cq);
-               *recv_cq = to_mcq(qp->ibqp.recv_cq);
+               *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL;
+               *recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL;
                 break;
  
         case IB_QPT_MAX:
@@ -1577,8 +1861,14 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
         struct mlx5_ib_cq *send_cq, *recv_cq;
         struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
         struct mlx5_modify_qp_mbox_in *in;
+       unsigned long flags;
         int err;
  
+       if (qp->ibqp.rwq_ind_tbl) {
+               destroy_rss_raw_qp_tir(dev, qp);
+               return;
+       }
+
         base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ?
                &qp->raw_packet_qp.rq.base :
                &qp->trans_qp.base;
@@ -1602,17 +1892,28 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
                                      base->mqp.qpn);
         }
  
-       get_cqs(qp, &send_cq, &recv_cq);
+       get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
+               &send_cq, &recv_cq);
+
+       spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+       mlx5_ib_lock_cqs(send_cq, recv_cq);
+       /* del from lists under both locks above to protect reset flow paths */
+       list_del(&qp->qps_list);
+       if (send_cq)
+               list_del(&qp->cq_send_list);
+
+       if (recv_cq)
+               list_del(&qp->cq_recv_list);
  
         if (qp->create_type == MLX5_QP_KERNEL) {
-               mlx5_ib_lock_cqs(send_cq, recv_cq);
                 __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn,
                                    qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
                 if (send_cq != recv_cq)
                         __mlx5_ib_cq_clean(send_cq, base->mqp.qpn,
                                            NULL);
-               mlx5_ib_unlock_cqs(send_cq, recv_cq);
         }
+       mlx5_ib_unlock_cqs(send_cq, recv_cq);
+       spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
  
         if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
                 destroy_raw_packet_qp(dev, qp);
@@ -2300,7 +2601,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
         }
  
         pd = get_pd(qp);
-       get_cqs(qp, &send_cq, &recv_cq);
+       get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
+               &send_cq, &recv_cq);
  
         context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn);
         context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0;
@@ -2349,6 +2651,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
         else
                 sqd_event = 0;
  
+       if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+               u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num :
+                              qp->port) - 1;
+               struct mlx5_ib_port *mibport = &dev->port[port_num];
+
+               context->qp_counter_set_usr_page |=
+                       cpu_to_be32((u32)(mibport->q_cnt_id) << 24);
+       }
+
         if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
                 context->sq_crq_size |= cpu_to_be16(1 << 4);
  
@@ -2439,6 +2750,9 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
         int port;
         enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED;
  
+       if (ibqp->rwq_ind_tbl)
+               return -ENOSYS;
+
         if (unlikely(ibqp->qp_type == IB_QPT_GSI))
                 return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask);
  
@@ -3397,6 +3711,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
  {
         struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* compiler warning */
         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+       struct mlx5_core_dev *mdev = dev->mdev;
         struct mlx5_ib_qp *qp;
         struct mlx5_ib_mr *mr;
         struct mlx5_wqe_data_seg *dpseg;
@@ -3424,6 +3739,13 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
  
         spin_lock_irqsave(&qp->sq.lock, flags);
  
+       if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+               err = -EIO;
+               *bad_wr = wr;
+               nreq = 0;
+               goto out;
+       }
+
         for (nreq = 0; wr; nreq++, wr = wr->next) {
                 if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) {
                         mlx5_ib_warn(dev, "\n");
@@ -3725,6 +4047,8 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
         struct mlx5_ib_qp *qp = to_mqp(ibqp);
         struct mlx5_wqe_data_seg *scat;
         struct mlx5_rwqe_sig *sig;
+       struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+       struct mlx5_core_dev *mdev = dev->mdev;
         unsigned long flags;
         int err = 0;
         int nreq;
@@ -3736,6 +4060,13 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
  
         spin_lock_irqsave(&qp->rq.lock, flags);
  
+       if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+               err = -EIO;
+               *bad_wr = wr;
+               nreq = 0;
+               goto out;
+       }
+
         ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
  
         for (nreq = 0; wr; nreq++, wr = wr->next) {
@@ -4055,6 +4386,9 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
         int err = 0;
         u8 raw_packet_qp_state;
  
+       if (ibqp->rwq_ind_tbl)
+               return -ENOSYS;
+
         if (unlikely(ibqp->qp_type == IB_QPT_GSI))
                 return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask,
                                             qp_init_attr);
@@ -4164,3 +4498,322 @@ int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
  
         return 0;
  }
+
+/*
+ * Create the hardware receive queue that backs @rwq.
+ *
+ * A create_rq_in command is filled from the geometry already stored in
+ * @rwq (stride, size, page layout, doorbell DMA address), the completion
+ * queue named by @init_attr and the protection domain @pd, and is then
+ * passed to mlx5_core_create_rq(), which stores the RQ number in rwq->rqn.
+ *
+ * Returns 0 on success, -ENOMEM if the command buffer cannot be allocated,
+ * or whatever mlx5_core_create_rq() returns.
+ */
+static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd,
+                    struct ib_wq_init_attr *init_attr)
+{
+       struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       int inlen = MLX5_ST_SZ_BYTES(create_rq_in) +
+                   sizeof(u64) * rwq->rq_num_pas;
+       void *in, *rqc, *wq;
+       __be64 *pas;
+       int err;
+
+       in = mlx5_vzalloc(inlen);
+       if (!in)
+               return -ENOMEM;
+
+       /* Receive queue context. */
+       rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
+       MLX5_SET(rqc, rqc, mem_rq_type,
+                MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE);
+       MLX5_SET(rqc, rqc, user_index, rwq->user_index);
+       MLX5_SET(rqc, rqc, cqn, to_mcq(init_attr->cq)->mcq.cqn);
+       MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
+       MLX5_SET(rqc, rqc, flush_in_error_en, 1);
+
+       /* Work queue embedded in the RQ context. */
+       wq = MLX5_ADDR_OF(rqc, rqc, wq);
+       MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
+       MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
+       MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride);
+       MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size);
+       MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn);
+       MLX5_SET(wq, wq, page_offset, rwq->rq_page_offset);
+       MLX5_SET(wq, wq, log_wq_pg_sz, rwq->log_page_size);
+       MLX5_SET(wq, wq, wq_signature, rwq->wq_sig);
+       MLX5_SET64(wq, wq, dbr_addr, rwq->db.dma);
+
+       /* Physical address list of the user buffer follows the WQ fields. */
+       pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
+       mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, pas, 0);
+
+       err = mlx5_core_create_rq(dev->mdev, in, inlen, &rwq->rqn);
+       kvfree(in);
+
+       return err;
+}
+
+/*
+ * Validate the RQ geometry requested by user space and record it in @rwq.
+ *
+ * @wq_init_attr->max_wr is checked against the device's log_max_wq_sz
+ * capability; @ucmd->rq_wqe_count must be non-zero. The WQE count and
+ * shift both come straight from user space, so the shift is bounded to
+ * the width of the count and the resulting buffer size is required to
+ * fit in an int before anything is stored.
+ *
+ * Returns 0 on success or -EINVAL if any of the checks fails.
+ */
+static int set_user_rq_size(struct mlx5_ib_dev *dev,
+                           struct ib_wq_init_attr *wq_init_attr,
+                           struct mlx5_ib_create_wq *ucmd,
+                           struct mlx5_ib_rwq *rwq)
+{
+       /* Sanity check RQ size before proceeding */
+       if (wq_init_attr->max_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_wq_sz)))
+               return -EINVAL;
+
+       if (!ucmd->rq_wqe_count)
+               return -EINVAL;
+
+       /*
+        * Shifting by the operand width or more is undefined, and a
+        * count/shift pair whose product wraps would leave a bogus
+        * buf_size behind. Reject both instead of silently truncating.
+        */
+       if (ucmd->rq_wqe_shift >= BITS_PER_BYTE * sizeof(ucmd->rq_wqe_count) ||
+           ucmd->rq_wqe_count > (INT_MAX >> ucmd->rq_wqe_shift))
+               return -EINVAL;
+
+       rwq->wqe_count = ucmd->rq_wqe_count;
+       rwq->wqe_shift = ucmd->rq_wqe_shift;
+       rwq->buf_size = (rwq->wqe_count << rwq->wqe_shift);
+       rwq->log_rq_stride = rwq->wqe_shift;
+       rwq->log_rq_size = ilog2(rwq->wqe_count);
+       return 0;
+}
+
+/*
+ * Parse and validate the user-space create-WQ command carried in @udata,
+ * then size and set up the user receive queue described by it.
+ *
+ * Returns 0 on success, or:
+ *   -EINVAL      the command is shorter than the structure through
+ *                its 'reserved' field;
+ *   -EOPNOTSUPP  bytes beyond the known structure are not all zero, or
+ *                comp_mask / reserved is non-zero;
+ *   -EFAULT      the command could not be copied from user space;
+ *   the error returned by set_user_rq_size() or create_user_rq().
+ */
+static int prepare_user_rq(struct ib_pd *pd,
+                          struct ib_wq_init_attr *init_attr,
+                          struct ib_udata *udata,
+                          struct mlx5_ib_rwq *rwq)
+{
+       struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       struct mlx5_ib_create_wq ucmd = {};
+       int err;
+       size_t required_cmd_sz;
+
+       required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved);
+       if (udata->inlen < required_cmd_sz) {
+               mlx5_ib_dbg(dev, "invalid inlen\n");
+               return -EINVAL;
+       }
+
+       /* A longer command is accepted only if the unknown tail is zero. */
+       if (udata->inlen > sizeof(ucmd) &&
+           !ib_is_udata_cleared(udata, sizeof(ucmd),
+                                udata->inlen - sizeof(ucmd))) {
+               mlx5_ib_dbg(dev, "inlen is not supported\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
+               mlx5_ib_dbg(dev, "copy failed\n");
+               return -EFAULT;
+       }
+
+       if (ucmd.comp_mask) {
+               mlx5_ib_dbg(dev, "invalid comp mask\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (ucmd.reserved) {
+               mlx5_ib_dbg(dev, "invalid reserved\n");
+               return -EOPNOTSUPP;
+       }
+
+       err = set_user_rq_size(dev, init_attr, &ucmd, rwq);
+       if (err) {
+               mlx5_ib_dbg(dev, "err %d\n", err);
+               return err;
+       }
+
+       err = create_user_rq(dev, pd, rwq, &ucmd);
+       if (err) {
+               mlx5_ib_dbg(dev, "err %d\n", err);
+               return err;
+       }
+
+       rwq->user_index = ucmd.user_index;
+       return 0;
+}
+
+/*
+ * Create a user-space work queue on @pd.
+ *
+ * Only IB_WQT_RQ is handled; any other type yields ERR_PTR(-EINVAL).
+ * A NULL @udata yields ERR_PTR(-ENOSYS). When the caller supplies a
+ * response buffer it must be large enough to hold the response through
+ * its 'reserved' field, otherwise ERR_PTR(-EINVAL) is returned.
+ *
+ * On success the new WQ is returned in the IB_WQS_RESET state with
+ * wq_num set to the hardware RQ number.
+ */
+struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
+                               struct ib_wq_init_attr *init_attr,
+                               struct ib_udata *udata)
+{
+       struct mlx5_ib_create_wq_resp resp = {};
+       struct mlx5_ib_dev *dev;
+       struct mlx5_ib_rwq *rwq;
+       size_t min_resp_len;
+       int err;
+
+       if (!udata)
+               return ERR_PTR(-ENOSYS);
+
+       min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+       if (udata->outlen && udata->outlen < min_resp_len)
+               return ERR_PTR(-EINVAL);
+
+       dev = to_mdev(pd->device);
+       if (init_attr->wq_type != IB_WQT_RQ) {
+               mlx5_ib_dbg(dev, "unsupported wq type %d\n",
+                           init_attr->wq_type);
+               return ERR_PTR(-EINVAL);
+       }
+
+       rwq = kzalloc(sizeof(*rwq), GFP_KERNEL);
+       if (!rwq)
+               return ERR_PTR(-ENOMEM);
+
+       err = prepare_user_rq(pd, init_attr, udata, rwq);
+       if (err)
+               goto err_free_rwq;
+
+       err = create_rq(rwq, pd, init_attr);
+       if (err)
+               goto err_destroy_user_rq;
+
+       rwq->ibwq.wq_num = rwq->rqn;
+       rwq->ibwq.state = IB_WQS_RESET;
+
+       /* Report back only as much of the response as is filled in. */
+       if (udata->outlen) {
+               resp.response_length = offsetof(typeof(resp), response_length) +
+                               sizeof(resp.response_length);
+               err = ib_copy_to_udata(udata, &resp, resp.response_length);
+               if (err)
+                       goto err_destroy_rq;
+       }
+
+       return &rwq->ibwq;
+
+err_destroy_rq:
+       mlx5_core_destroy_rq(dev->mdev, rwq->rqn);
+err_destroy_user_rq:
+       destroy_user_rq(pd, rwq);
+err_free_rwq:
+       kfree(rwq);
+       return ERR_PTR(err);
+}
+
+/*
+ * Tear down a work queue made by mlx5_ib_create_wq(): destroy the
+ * hardware RQ, release the user RQ resources and free the driver
+ * object. Always returns 0.
+ */
+int mlx5_ib_destroy_wq(struct ib_wq *wq)
+{
+       struct mlx5_ib_rwq *rwq = to_mrwq(wq);
+       struct mlx5_core_dev *mdev = to_mdev(wq->device)->mdev;
+
+       mlx5_core_destroy_rq(mdev, rwq->rqn);
+       destroy_user_rq(wq->pd, rwq);
+       kfree(rwq);
+
+       return 0;
+}
+
+/*
+ * Create a receive-WQ indirection table holding
+ * 1 << init_attr->log_ind_tbl_size entries, one per WQ listed in
+ * init_attr->ind_tbl[].
+ *
+ * Returns the new table, or an ERR_PTR():
+ *   -EOPNOTSUPP  the user command carries non-zero bytes (no vendor
+ *                input is defined for this call);
+ *   -EINVAL      the requested table is larger than the device's
+ *                log_max_rqt_size capability, or the response buffer
+ *                is non-empty but too short;
+ *   -ENOMEM      allocation failure;
+ *   the error from mlx5_core_create_rqt() or ib_copy_to_udata().
+ */
+struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
+                                                     struct ib_rwq_ind_table_init_attr *init_attr,
+                                                     struct ib_udata *udata)
+{
+       struct mlx5_ib_dev *dev = to_mdev(device);
+       struct mlx5_ib_rwq_ind_table *rwq_ind_tbl;
+       struct mlx5_ib_create_rwq_ind_tbl_resp resp = {};
+       size_t min_resp_len;
+       int inlen;
+       int err;
+       int sz;
+       int i;
+       u32 *in;
+       void *rqtc;
+
+       if (udata->inlen > 0 &&
+           !ib_is_udata_cleared(udata, 0,
+                                udata->inlen))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       /*
+        * Bound the table size by what the device supports before it is
+        * used as a shift count and to size the command buffer.
+        */
+       if (init_attr->log_ind_tbl_size >
+           MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)) {
+               mlx5_ib_dbg(dev, "log_ind_tbl_size = %d is bigger than supported = %d\n",
+                           init_attr->log_ind_tbl_size,
+                           MLX5_CAP_GEN(dev->mdev, log_max_rqt_size));
+               return ERR_PTR(-EINVAL);
+       }
+
+       min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+       if (udata->outlen && udata->outlen < min_resp_len)
+               return ERR_PTR(-EINVAL);
+
+       rwq_ind_tbl = kzalloc(sizeof(*rwq_ind_tbl), GFP_KERNEL);
+       if (!rwq_ind_tbl)
+               return ERR_PTR(-ENOMEM);
+
+       sz = 1 << init_attr->log_ind_tbl_size;
+       inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
+       in = mlx5_vzalloc(inlen);
+       if (!in) {
+               err = -ENOMEM;
+               goto err;
+       }
+
+       rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
+
+       MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
+       MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
+
+       for (i = 0; i < sz; i++)
+               MLX5_SET(rqtc, rqtc, rq_num[i], init_attr->ind_tbl[i]->wq_num);
+
+       err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn);
+       kvfree(in);
+
+       if (err)
+               goto err;
+
+       rwq_ind_tbl->ib_rwq_ind_tbl.ind_tbl_num = rwq_ind_tbl->rqtn;
+       /* Report back only as much of the response as is filled in. */
+       if (udata->outlen) {
+               resp.response_length = offsetof(typeof(resp), response_length) +
+                                       sizeof(resp.response_length);
+               err = ib_copy_to_udata(udata, &resp, resp.response_length);
+               if (err)
+                       goto err_copy;
+       }
+
+       return &rwq_ind_tbl->ib_rwq_ind_tbl;
+
+err_copy:
+       mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn);
+err:
+       kfree(rwq_ind_tbl);
+       return ERR_PTR(err);
+}
+
+/*
+ * Destroy the hardware RQ table behind @ib_rwq_ind_tbl and free the
+ * driver object. Always returns 0.
+ */
+int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl)
+{
+       struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_tbl->device);
+       struct mlx5_ib_rwq_ind_table *tbl;
+
+       tbl = to_mrwq_ind_table(ib_rwq_ind_tbl);
+       mlx5_core_destroy_rqt(dev->mdev, tbl->rqtn);
+       kfree(tbl);
+
+       return 0;
+}
+
+/*
+ * Move a work queue from its current state to a new one.
+ *
+ * The current state is wq_attr->curr_wq_state when IB_WQ_CUR_STATE is
+ * set in @wq_attr_mask, otherwise wq->state. The target state is
+ * wq_attr->wq_state when IB_WQ_STATE is set, otherwise the current
+ * state. IB_WQS_ERR is translated to MLX5_RQC_STATE_ERR for the device;
+ * every other state value is passed through unchanged.
+ *
+ * Returns 0 on success, or:
+ *   -EINVAL      the user command is shorter than the structure
+ *                through its 'reserved' field;
+ *   -EOPNOTSUPP  bytes beyond the known structure are not all zero, or
+ *                comp_mask / reserved is non-zero;
+ *   -EFAULT      the command could not be copied from user space;
+ *   -ENOMEM      the command buffer could not be allocated;
+ *   the error returned by mlx5_core_modify_rq().
+ */
+int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+                     u32 wq_attr_mask, struct ib_udata *udata)
+{
+       struct mlx5_ib_dev *dev = to_mdev(wq->device);
+       struct mlx5_ib_rwq *rwq = to_mrwq(wq);
+       struct mlx5_ib_modify_wq ucmd = {};
+       const int inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
+       size_t min_cmd_sz;
+       int cur_state;
+       int new_state;
+       void *rqc;
+       void *in;
+       int err;
+
+       min_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved);
+       if (udata->inlen < min_cmd_sz)
+               return -EINVAL;
+
+       /* A longer command is accepted only if the unknown tail is zero. */
+       if (udata->inlen > sizeof(ucmd) &&
+           !ib_is_udata_cleared(udata, sizeof(ucmd),
+                                udata->inlen - sizeof(ucmd)))
+               return -EOPNOTSUPP;
+
+       if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)))
+               return -EFAULT;
+
+       if (ucmd.comp_mask)
+               return -EOPNOTSUPP;
+       if (ucmd.reserved)
+               return -EOPNOTSUPP;
+
+       in = mlx5_vzalloc(inlen);
+       if (!in)
+               return -ENOMEM;
+
+       rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
+
+       cur_state = wq->state;
+       if (wq_attr_mask & IB_WQ_CUR_STATE)
+               cur_state = wq_attr->curr_wq_state;
+
+       new_state = cur_state;
+       if (wq_attr_mask & IB_WQ_STATE)
+               new_state = wq_attr->wq_state;
+
+       /* Translate the verbs error state to the device encoding. */
+       if (cur_state == IB_WQS_ERR)
+               cur_state = MLX5_RQC_STATE_ERR;
+       if (new_state == IB_WQS_ERR)
+               new_state = MLX5_RQC_STATE_ERR;
+
+       MLX5_SET(modify_rq_in, in, rq_state, cur_state);
+       MLX5_SET(rqc, rqc, state, new_state);
+
+       err = mlx5_core_modify_rq(dev->mdev, rwq->rqn, in, inlen);
+       kvfree(in);
+       if (err)
+               return err;
+
+       /* Record the new state using the verbs encoding. */
+       if (new_state == MLX5_RQC_STATE_ERR)
+               rwq->ibwq.state = IB_WQS_ERR;
+       else
+               rwq->ibwq.state = new_state;
+
+       return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c

index 3b2ddd64a371689e1533cb08c23007a4d6016b03..ed6ac52355f1b9ddc81d4d8a13cdf16d1904c0e6 100644 (file)
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -74,14 +74,12 @@ static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type)
  }
  
  static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
-                          struct mlx5_create_srq_mbox_in **in,
-                          struct ib_udata *udata, int buf_size, int *inlen,
-                          int is_xrc)
+                          struct mlx5_srq_attr *in,
+                          struct ib_udata *udata, int buf_size)
  {
         struct mlx5_ib_dev *dev = to_mdev(pd->device);
         struct mlx5_ib_create_srq ucmd = {};
         size_t ucmdlen;
-       void *xsrqc;
         int err;
         int npages;
         int page_shift;
@@ -104,7 +102,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
                                  udata->inlen - sizeof(ucmd)))
                 return -EINVAL;
  
-       if (is_xrc) {
+       if (in->type == IB_SRQT_XRC) {
                 err = get_srq_user_index(to_mucontext(pd->uobject->context),
                                          &ucmd, udata->inlen, &uidx);
                 if (err)
@@ -130,14 +128,13 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
                 goto err_umem;
         }
  
-       *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont;
-       *in = mlx5_vzalloc(*inlen);
-       if (!(*in)) {
+       in->pas = mlx5_vzalloc(sizeof(*in->pas) * ncont);
+       if (!in->pas) {
                 err = -ENOMEM;
                 goto err_umem;
         }
  
-       mlx5_ib_populate_pas(dev, srq->umem, page_shift, (*in)->pas, 0);
+       mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0);
  
         err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context),
                                   ucmd.db_addr, &srq->db);
@@ -146,20 +143,16 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
                 goto err_in;
         }
  
-       (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
-       (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26);
-
-       if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) &&
-            is_xrc){
-               xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in,
-                                    xrc_srq_context_entry);
-               MLX5_SET(xrc_srqc, xsrqc, user_index, uidx);
-       }
+       in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+       in->page_offset = offset;
+       if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
+           in->type == IB_SRQT_XRC)
+               in->user_index = uidx;
  
         return 0;
  
  err_in:
-       kvfree(*in);
+       kvfree(in->pas);
  
  err_umem:
         ib_umem_release(srq->umem);
@@ -168,15 +161,13 @@ err_umem:
  }
  
  static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
-                            struct mlx5_create_srq_mbox_in **in, int buf_size,
-                            int *inlen, int is_xrc)
+                            struct mlx5_srq_attr *in, int buf_size)
  {
         int err;
         int i;
         struct mlx5_wqe_srq_next_seg *next;
         int page_shift;
         int npages;
-       void *xsrqc;
  
         err = mlx5_db_alloc(dev->mdev, &srq->db);
         if (err) {
@@ -204,13 +195,12 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
         npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT));
         mlx5_ib_dbg(dev, "buf_size %d, page_shift %d, npages %d, calc npages %d\n",
                     buf_size, page_shift, srq->buf.npages, npages);
-       *inlen = sizeof(**in) + sizeof(*(*in)->pas) * npages;
-       *in = mlx5_vzalloc(*inlen);
-       if (!*in) {
+       in->pas = mlx5_vzalloc(sizeof(*in->pas) * npages);
+       if (!in->pas) {
                 err = -ENOMEM;
                 goto err_buf;
         }
-       mlx5_fill_page_array(&srq->buf, (*in)->pas);
+       mlx5_fill_page_array(&srq->buf, in->pas);
  
         srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL);
         if (!srq->wrid) {
@@ -221,20 +211,15 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
         }
         srq->wq_sig = !!srq_signature;
  
-       (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
-
-       if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) &&
-            is_xrc){
-               xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in,
-                                    xrc_srq_context_entry);
-               /* 0xffffff means we ask to work with cqe version 0 */
-               MLX5_SET(xrc_srqc, xsrqc, user_index, MLX5_IB_DEFAULT_UIDX);
-       }
+       in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+       if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
+           in->type == IB_SRQT_XRC)
+               in->user_index = MLX5_IB_DEFAULT_UIDX;
  
         return 0;
  
  err_in:
-       kvfree(*in);
+       kvfree(in->pas);
  
  err_buf:
         mlx5_buf_free(dev->mdev, &srq->buf);
@@ -267,10 +252,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
         int desc_size;
         int buf_size;
         int err;
-       struct mlx5_create_srq_mbox_in *uninitialized_var(in);
-       int uninitialized_var(inlen);
-       int is_xrc;
-       u32 flgs, xrcdn;
+       struct mlx5_srq_attr in = {0};
         __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
  
         /* Sanity check SRQ size before proceeding */
@@ -302,14 +284,10 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
                     desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs,
                     srq->msrq.max_avail_gather);
  
-       is_xrc = (init_attr->srq_type == IB_SRQT_XRC);
-
         if (pd->uobject)
-               err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen,
-                                     is_xrc);
+               err = create_srq_user(pd, srq, &in, udata, buf_size);
         else
-               err = create_srq_kernel(dev, srq, &in, buf_size, &inlen,
-                                       is_xrc);
+               err = create_srq_kernel(dev, srq, &in, buf_size);
  
         if (err) {
                 mlx5_ib_warn(dev, "create srq %s failed, err %d\n",
@@ -317,23 +295,23 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
                 goto err_srq;
         }
  
-       in->ctx.state_log_sz = ilog2(srq->msrq.max);
-       flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24;
-       xrcdn = 0;
-       if (is_xrc) {
-               xrcdn = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn;
-               in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(init_attr->ext.xrc.cq)->mcq.cqn);
+       in.type = init_attr->srq_type;
+       in.log_size = ilog2(srq->msrq.max);
+       in.wqe_shift = srq->msrq.wqe_shift - 4;
+       if (srq->wq_sig)
+               in.flags |= MLX5_SRQ_FLAG_WQ_SIG;
+       if (init_attr->srq_type == IB_SRQT_XRC) {
+               in.xrcd = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn;
+               in.cqn = to_mcq(init_attr->ext.xrc.cq)->mcq.cqn;
         } else if (init_attr->srq_type == IB_SRQT_BASIC) {
-               xrcdn = to_mxrcd(dev->devr.x0)->xrcdn;
-               in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(dev->devr.c0)->mcq.cqn);
+               in.xrcd = to_mxrcd(dev->devr.x0)->xrcdn;
+               in.cqn = to_mcq(dev->devr.c0)->mcq.cqn;
         }
  
-       in->ctx.flags_xrcd = cpu_to_be32((flgs & 0xFF000000) | (xrcdn & 0xFFFFFF));
-
-       in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn);
-       in->ctx.db_record = cpu_to_be64(srq->db.dma);
-       err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen, is_xrc);
-       kvfree(in);
+       in.pd = to_mpd(pd)->pdn;
+       in.db_record = srq->db.dma;
+       err = mlx5_core_create_srq(dev->mdev, &srq->msrq, &in);
+       kvfree(in.pas);
         if (err) {
                 mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err);
                 goto err_usr_kern_srq;
@@ -401,7 +379,7 @@ int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
         struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
         struct mlx5_ib_srq *srq = to_msrq(ibsrq);
         int ret;
-       struct mlx5_query_srq_mbox_out *out;
+       struct mlx5_srq_attr *out;
  
         out = kzalloc(sizeof(*out), GFP_KERNEL);
         if (!out)
@@ -411,7 +389,7 @@ int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
         if (ret)
                 goto out_box;
  
-       srq_attr->srq_limit = be16_to_cpu(out->ctx.lwm);
+       srq_attr->srq_limit = out->lwm;
         srq_attr->max_wr    = srq->msrq.max - 1;
         srq_attr->max_sge   = srq->msrq.max_gs;
  
@@ -458,6 +436,8 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
         struct mlx5_ib_srq *srq = to_msrq(ibsrq);
         struct mlx5_wqe_srq_next_seg *next;
         struct mlx5_wqe_data_seg *scat;
+       struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
+       struct mlx5_core_dev *mdev = dev->mdev;
         unsigned long flags;
         int err = 0;
         int nreq;
@@ -465,6 +445,12 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
  
         spin_lock_irqsave(&srq->lock, flags);
  
+       if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+               err = -EIO;
+               *bad_wr = wr;
+               goto out;
+       }
+
         for (nreq = 0; wr; nreq++, wr = wr->next) {
                 if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
                         err = -EINVAL;
@@ -507,7 +493,7 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
  
                 *srq->db.db = cpu_to_be32(srq->wqe_ctr);
         }
-
+out:
         spin_unlock_irqrestore(&srq->lock, flags);
  
         return err;
diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h

index 61bc308bb802ce93069601e85507849a8f61e98d..188dac4301b53f4415c4c504fe06df0b408357cd 100644 (file)
--- a/drivers/infiniband/hw/mlx5/user.h
+++ b/drivers/infiniband/hw/mlx5/user.h
@@ -46,6 +46,10 @@ enum {
         MLX5_SRQ_FLAG_SIGNATURE         = 1 << 0,
  };
  
+enum {
+       MLX5_WQ_FLAG_SIGNATURE          = 1 << 0,
+};
+
  
  /* Increment this value if any changes that break userspace ABI
   * compatibility are made.
@@ -79,6 +83,10 @@ enum mlx5_ib_alloc_ucontext_resp_mask {
         MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0,
  };
  
+enum mlx5_user_cmds_supp_uhw {
+       MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE = 1 << 0,
+};
+
  struct mlx5_ib_alloc_ucontext_resp {
         __u32   qp_tab_size;
         __u32   bf_reg_size;
@@ -94,8 +102,8 @@ struct mlx5_ib_alloc_ucontext_resp {
         __u32   comp_mask;
         __u32   response_length;
         __u8    cqe_version;
-       __u8    reserved2;
-       __u16   reserved3;
+       __u8    cmds_supp_uhw;
+       __u16   reserved2;
         __u64   hca_core_clock_offset;
  };
  
@@ -103,6 +111,22 @@ struct mlx5_ib_alloc_pd_resp {
         __u32   pdn;
  };
  
+struct mlx5_ib_tso_caps {
+       __u32 max_tso; /* Maximum tso payload size in bytes */
+
+       /* Corresponding bit will be set if qp type from
+        * 'enum ib_qp_type' is supported, e.g.
+        * supported_qpts |= 1 << IB_QPT_UD
+        */
+       __u32 supported_qpts;
+};
+
+struct mlx5_ib_query_device_resp {
+       __u32   comp_mask;
+       __u32   response_length;
+       struct  mlx5_ib_tso_caps tso_caps;
+};
+
  struct mlx5_ib_create_cq {
         __u64   buf_addr;
         __u64   db_addr;
@@ -148,6 +172,40 @@ struct mlx5_ib_create_qp {
         __u64   sq_buf_addr;
  };
  
+/* RX Hash function flags */
+enum mlx5_rx_hash_function_flags {
+       MLX5_RX_HASH_FUNC_TOEPLITZ      = 1 << 0,
+};
+
+/*
+ * RX Hash flags: these flags select which fields of an incoming packet
+ * participate in the RX Hash. Each flag represents one packet field;
+ * when the flag is set, the field it represents takes part in the
+ * RX Hash calculation.
+ * Note: *IPV4 and *IPV6 flags can't be enabled together on the same QP
+ * and *TCP and *UDP flags can't be enabled together on the same QP.
+*/
+enum mlx5_rx_hash_fields {
+       MLX5_RX_HASH_SRC_IPV4   = 1 << 0,
+       MLX5_RX_HASH_DST_IPV4   = 1 << 1,
+       MLX5_RX_HASH_SRC_IPV6   = 1 << 2,
+       MLX5_RX_HASH_DST_IPV6   = 1 << 3,
+       MLX5_RX_HASH_SRC_PORT_TCP       = 1 << 4,
+       MLX5_RX_HASH_DST_PORT_TCP       = 1 << 5,
+       MLX5_RX_HASH_SRC_PORT_UDP       = 1 << 6,
+       MLX5_RX_HASH_DST_PORT_UDP       = 1 << 7
+};
+
+struct mlx5_ib_create_qp_rss {
+       __u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */
+       __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */
+       __u8 rx_key_len; /* valid only for Toeplitz */
+       __u8 reserved[6];
+       __u8 rx_hash_key[128]; /* valid only for Toeplitz */
+       __u32   comp_mask;
+       __u32   reserved1;
+};
+
  struct mlx5_ib_create_qp_resp {
         __u32   uuar_index;
  };
@@ -159,6 +217,32 @@ struct mlx5_ib_alloc_mw {
         __u16   reserved2;
  };
  
+struct mlx5_ib_create_wq {     /* user command parsed by prepare_user_rq() in qp.c */
+       __u64   buf_addr;
+       __u64   db_addr;
+       __u32   rq_wqe_count;   /* number of RQ WQEs; 0 is rejected with -EINVAL */
+       __u32   rq_wqe_shift;   /* buffer size is rq_wqe_count << rq_wqe_shift; also stored as log_rq_stride */
+       __u32   user_index;     /* copied to rwq->user_index and written to the RQ context */
+       __u32   flags;
+       __u32   comp_mask;      /* must be 0, otherwise -EOPNOTSUPP */
+       __u32   reserved;       /* must be 0, otherwise -EOPNOTSUPP */
+};
+
+struct mlx5_ib_create_wq_resp {        /* response written by mlx5_ib_create_wq() when the caller supplies an output buffer */
+       __u32   response_length;        /* number of bytes the driver copied back: everything up to and including this field */
+       __u32   reserved;       /* a non-empty output buffer must be large enough to reach the end of this field */
+};
+
+struct mlx5_ib_create_rwq_ind_tbl_resp {       /* response written by mlx5_ib_create_rwq_ind_table() when the caller supplies an output buffer */
+       __u32   response_length;        /* number of bytes the driver copied back: everything up to and including this field */
+       __u32   reserved;       /* a non-empty output buffer must be large enough to reach the end of this field */
+};
+
+struct mlx5_ib_modify_wq {     /* user command parsed by mlx5_ib_modify_wq() in qp.c */
+       __u32   comp_mask;      /* must be 0, otherwise -EOPNOTSUPP */
+       __u32   reserved;       /* must be 0, otherwise -EOPNOTSUPP */
+};
+
  static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext,
                                     struct mlx5_ib_create_qp *ucmd,
                                     int inlen,
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c

index 9866c35cc977d5036b8b7c6b1126a3508eef9212..da2335f7f7c357b9c0266ac9c225c47e94a63994 100644 (file)
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -1081,16 +1081,6 @@ static ssize_t show_rev(struct device *device, struct device_attribute *attr,
         return sprintf(buf, "%x\n", dev->rev_id);
  }
  
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
-                          char *buf)
-{
-       struct mthca_dev *dev =
-               container_of(device, struct mthca_dev, ib_dev.dev);
-       return sprintf(buf, "%d.%d.%d\n", (int) (dev->fw_ver >> 32),
-                      (int) (dev->fw_ver >> 16) & 0xffff,
-                      (int) dev->fw_ver & 0xffff);
-}
-
  static ssize_t show_hca(struct device *device, struct device_attribute *attr,
                         char *buf)
  {
@@ -1120,13 +1110,11 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr,
  }
  
  static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
-static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
  static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
  static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
  
  static struct device_attribute *mthca_dev_attributes[] = {
         &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
         &dev_attr_hca_type,
         &dev_attr_board_id
  };
@@ -1187,6 +1175,17 @@ static int mthca_port_immutable(struct ib_device *ibdev, u8 port_num,
         return 0;
  }
  
+static void get_dev_fw_str(struct ib_device *device, char *str,
+                          size_t str_len)
+{
+       struct mthca_dev *dev =
+               container_of(device, struct mthca_dev, ib_dev);
+       snprintf(str, str_len, "%d.%d.%d",
+                (int) (dev->fw_ver >> 32),
+                (int) (dev->fw_ver >> 16) & 0xffff,
+                (int) dev->fw_ver & 0xffff);
+}
+
  int mthca_register_device(struct mthca_dev *dev)
  {
         int ret;
@@ -1266,6 +1265,7 @@ int mthca_register_device(struct mthca_dev *dev)
         dev->ib_dev.reg_user_mr          = mthca_reg_user_mr;
         dev->ib_dev.dereg_mr             = mthca_dereg_mr;
         dev->ib_dev.get_port_immutable   = mthca_port_immutable;
+       dev->ib_dev.get_dev_fw_str       = get_dev_fw_str;
  
         if (dev->mthca_flags & MTHCA_FLAG_FMR) {
                 dev->ib_dev.alloc_fmr            = mthca_alloc_fmr;
diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c

index 74c6a9426047b8a34a419cddd39294b294d2fe38..6727af27c017db48c5aaa5cc6b549d68671d65fe 100644 (file)
--- a/drivers/infiniband/hw/mthca/mthca_reset.c
+++ b/drivers/infiniband/hw/mthca/mthca_reset.c
@@ -98,7 +98,7 @@ int mthca_reset(struct mthca_dev *mdev)
                 err = -ENOMEM;
                 mthca_err(mdev, "Couldn't allocate memory to save HCA "
                           "PCI header, aborting.\n");
-               goto out;
+               goto put_dev;
         }
  
         for (i = 0; i < 64; ++i) {
@@ -108,7 +108,7 @@ int mthca_reset(struct mthca_dev *mdev)
                         err = -ENODEV;
                         mthca_err(mdev, "Couldn't save HCA "
                                   "PCI header, aborting.\n");
-                       goto out;
+                       goto free_hca;
                 }
         }
  
@@ -121,7 +121,7 @@ int mthca_reset(struct mthca_dev *mdev)
                         err = -ENOMEM;
                         mthca_err(mdev, "Couldn't allocate memory to save HCA "
                                   "bridge PCI header, aborting.\n");
-                       goto out;
+                       goto free_hca;
                 }
  
                 for (i = 0; i < 64; ++i) {
@@ -131,7 +131,7 @@ int mthca_reset(struct mthca_dev *mdev)
                                 err = -ENODEV;
                                 mthca_err(mdev, "Couldn't save HCA bridge "
                                           "PCI header, aborting.\n");
-                               goto out;
+                               goto free_bh;
                         }
                 }
                 bridge_pcix_cap = pci_find_capability(bridge, PCI_CAP_ID_PCIX);
@@ -139,7 +139,7 @@ int mthca_reset(struct mthca_dev *mdev)
                                 err = -ENODEV;
                                 mthca_err(mdev, "Couldn't locate HCA bridge "
                                           "PCI-X capability, aborting.\n");
-                               goto out;
+                               goto free_bh;
                 }
         }
  
@@ -152,7 +152,7 @@ int mthca_reset(struct mthca_dev *mdev)
                         err = -ENOMEM;
                         mthca_err(mdev, "Couldn't map HCA reset register, "
                                   "aborting.\n");
-                       goto out;
+                       goto free_bh;
                 }
  
                 writel(MTHCA_RESET_VALUE, reset);
@@ -172,7 +172,7 @@ int mthca_reset(struct mthca_dev *mdev)
                                 err = -ENODEV;
                                 mthca_err(mdev, "Couldn't access HCA after reset, "
                                           "aborting.\n");
-                               goto out;
+                               goto free_bh;
                         }
  
                         if (v != 0xffffffff)
@@ -184,7 +184,7 @@ int mthca_reset(struct mthca_dev *mdev)
                 err = -ENODEV;
                 mthca_err(mdev, "PCI device did not come back after reset, "
                           "aborting.\n");
-               goto out;
+               goto free_bh;
         }
  
  good:
@@ -195,14 +195,14 @@ good:
                         err = -ENODEV;
                         mthca_err(mdev, "Couldn't restore HCA bridge Upstream "
                                   "split transaction control, aborting.\n");
-                       goto out;
+                       goto free_bh;
                 }
                 if (pci_write_config_dword(bridge, bridge_pcix_cap + 0xc,
                                  bridge_header[(bridge_pcix_cap + 0xc) / 4])) {
                         err = -ENODEV;
                         mthca_err(mdev, "Couldn't restore HCA bridge Downstream "
                                   "split transaction control, aborting.\n");
-                       goto out;
+                       goto free_bh;
                 }
                 /*
                  * Bridge control register is at 0x3e, so we'll
@@ -216,7 +216,7 @@ good:
                                 err = -ENODEV;
                                 mthca_err(mdev, "Couldn't restore HCA bridge reg %x, "
                                           "aborting.\n", i);
-                               goto out;
+                               goto free_bh;
                         }
                 }
  
@@ -225,7 +225,7 @@ good:
                         err = -ENODEV;
                         mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, "
                                   "aborting.\n");
-                       goto out;
+                       goto free_bh;
                 }
         }
  
@@ -235,7 +235,7 @@ good:
                         err = -ENODEV;
                         mthca_err(mdev, "Couldn't restore HCA PCI-X "
                                   "command register, aborting.\n");
-                       goto out;
+                       goto free_bh;
                 }
         }
  
@@ -246,7 +246,7 @@ good:
                         err = -ENODEV;
                         mthca_err(mdev, "Couldn't restore HCA PCI Express "
                                   "Device Control register, aborting.\n");
-                       goto out;
+                       goto free_bh;
                 }
                 linkctl = hca_header[(hca_pcie_cap + PCI_EXP_LNKCTL) / 4];
                 if (pcie_capability_write_word(mdev->pdev, PCI_EXP_LNKCTL,
@@ -254,7 +254,7 @@ good:
                         err = -ENODEV;
                         mthca_err(mdev, "Couldn't restore HCA PCI Express "
                                   "Link control register, aborting.\n");
-                       goto out;
+                       goto free_bh;
                 }
         }
  
@@ -266,7 +266,7 @@ good:
                         err = -ENODEV;
                         mthca_err(mdev, "Couldn't restore HCA reg %x, "
                                   "aborting.\n", i);
-                       goto out;
+                       goto free_bh;
                 }
         }
  
@@ -275,14 +275,12 @@ good:
                 err = -ENODEV;
                 mthca_err(mdev, "Couldn't restore HCA COMMAND, "
                           "aborting.\n");
-               goto out;
         }
-
-out:
-       if (bridge)
-               pci_dev_put(bridge);
+free_bh:
         kfree(bridge_header);
+free_hca:
         kfree(hca_header);
-
+put_dev:
+       pci_dev_put(bridge);
         return err;
  }
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c

index 464d6da5fe913707f18ff95cc3df93a3b36c4ee7..bd69125731c1816245fc90c2bfe7d4cbffb8c02a 100644 (file)
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -2605,23 +2605,6 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
  }
  
  
-/**
- * show_fw_ver
- */
-static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
-                          char *buf)
-{
-       struct nes_ib_device *nesibdev =
-                       container_of(dev, struct nes_ib_device, ibdev.dev);
-       struct nes_vnic *nesvnic = nesibdev->nesvnic;
-
-       nes_debug(NES_DBG_INIT, "\n");
-       return sprintf(buf, "%u.%u\n",
-               (nesvnic->nesdev->nesadapter->firmware_version >> 16),
-               (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff));
-}
-
-
  /**
   * show_hca
   */
@@ -2645,13 +2628,11 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr,
  
  
  static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
  static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
  static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
  
  static struct device_attribute *nes_dev_attributes[] = {
         &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
         &dev_attr_hca_type,
         &dev_attr_board_id
  };
@@ -3703,6 +3684,19 @@ static int nes_port_immutable(struct ib_device *ibdev, u8 port_num,
         return 0;
  }
  
+static void get_dev_fw_str(struct ib_device *dev, char *str,
+                          size_t str_len)
+{
+       struct nes_ib_device *nesibdev =
+                       container_of(dev, struct nes_ib_device, ibdev);
+       struct nes_vnic *nesvnic = nesibdev->nesvnic;
+
+       nes_debug(NES_DBG_INIT, "\n");
+       snprintf(str, str_len, "%u.%u",
+                (nesvnic->nesdev->nesadapter->firmware_version >> 16),
+                (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff));
+}
+
  /**
   * nes_init_ofa_device
   */
@@ -3802,6 +3796,7 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
         nesibdev->ibdev.iwcm->create_listen = nes_create_listen;
         nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen;
         nesibdev->ibdev.get_port_immutable   = nes_port_immutable;
+       nesibdev->ibdev.get_dev_fw_str   = get_dev_fw_str;
         memcpy(nesibdev->ibdev.iwcm->ifname, netdev->name,
                sizeof(nesibdev->ibdev.iwcm->ifname));
  
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c

index 3d75f65ce87e207bf958b44fe3beb98f7666f6ea..07d0c6c5b0465755a48f71add1d98e7e61f215bc 100644 (file)
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -107,6 +107,14 @@ static int ocrdma_port_immutable(struct ib_device *ibdev, u8 port_num,
         return 0;
  }
  
+static void get_dev_fw_str(struct ib_device *device, char *str,
+                          size_t str_len)
+{
+       struct ocrdma_dev *dev = get_ocrdma_dev(device);
+
+       snprintf(str, str_len, "%s", &dev->attr.fw_ver[0]);
+}
+
  static int ocrdma_register_device(struct ocrdma_dev *dev)
  {
         strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX);
@@ -193,6 +201,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
  
         dev->ibdev.process_mad = ocrdma_process_mad;
         dev->ibdev.get_port_immutable = ocrdma_port_immutable;
+       dev->ibdev.get_dev_fw_str     = get_dev_fw_str;
  
         if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
                 dev->ibdev.uverbs_cmd_mask |=
@@ -262,14 +271,6 @@ static ssize_t show_rev(struct device *device, struct device_attribute *attr,
         return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor);
  }
  
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
-                       char *buf)
-{
-       struct ocrdma_dev *dev = dev_get_drvdata(device);
-
-       return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->attr.fw_ver[0]);
-}
-
  static ssize_t show_hca_type(struct device *device,
                              struct device_attribute *attr, char *buf)
  {
@@ -279,12 +280,10 @@ static ssize_t show_hca_type(struct device *device,
  }
  
  static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
  static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL);
  
  static struct device_attribute *ocrdma_attributes[] = {
         &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
         &dev_attr_hca_type
  };
  
diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c

index 575b737d9ef3d59441c35545f22ebdf53731b835..9cc0aae1d78191735b31eaed33bca1ee21de7d01 100644 (file)
--- a/drivers/infiniband/hw/qib/qib_qp.c
+++ b/drivers/infiniband/hw/qib/qib_qp.c
@@ -106,6 +106,49 @@ static u32 credit_table[31] = {
         32768                   /* 1E */
  };
  
+const struct rvt_operation_params qib_post_parms[RVT_OPERATION_MAX] = {
+[IB_WR_RDMA_WRITE] = {
+       .length = sizeof(struct ib_rdma_wr),
+       .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_RDMA_READ] = {
+       .length = sizeof(struct ib_rdma_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_ATOMIC,
+},
+
+[IB_WR_ATOMIC_CMP_AND_SWP] = {
+       .length = sizeof(struct ib_atomic_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
+},
+
+[IB_WR_ATOMIC_FETCH_AND_ADD] = {
+       .length = sizeof(struct ib_atomic_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
+},
+
+[IB_WR_RDMA_WRITE_WITH_IMM] = {
+       .length = sizeof(struct ib_rdma_wr),
+       .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_SEND] = {
+       .length = sizeof(struct ib_send_wr),
+       .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) |
+                      BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_SEND_WITH_IMM] = {
+       .length = sizeof(struct ib_send_wr),
+       .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) |
+                      BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+};
+
  static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map,
                          gfp_t gfp)
  {
diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c

index 846e6c726df7c67204f1dc1d5f1bfb066c36e149..10d062561bd96f49617a754d26873353b39d78a4 100644 (file)
--- a/drivers/infiniband/hw/qib/qib_ud.c
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@@ -169,8 +169,12 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
         }
  
         if (ah_attr->ah_flags & IB_AH_GRH) {
-               qib_copy_sge(&qp->r_sge, &ah_attr->grh,
-                            sizeof(struct ib_grh), 1);
+               struct ib_grh grh;
+               struct ib_global_route grd = ah_attr->grh;
+
+               qib_make_grh(ibp, &grh, &grd, 0, 0);
+               qib_copy_sge(&qp->r_sge, &grh,
+                            sizeof(grh), 1);
                 wc.wc_flags |= IB_WC_GRH;
         } else
                 qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c

index cbf6200e6afc06b9ba7dc485e00cd807ea78872f..fd1dfbce5539742cb6b5a2ed994b213c50d5672a 100644 (file)
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -1582,6 +1582,8 @@ static void qib_fill_device_attr(struct qib_devdata *dd)
         rdi->dparms.props.max_total_mcast_qp_attach =
                                         rdi->dparms.props.max_mcast_qp_attach *
                                         rdi->dparms.props.max_mcast_grp;
+       /* post send table */
+       dd->verbs_dev.rdi.post_parms = qib_post_parms;
  }
  
  /**
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h

index 4f878151f81ff43263ea9524f10b09918f94e565..736ced68484221de35433a9a95e6619fdddaeecc 100644 (file)
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -497,4 +497,6 @@ extern unsigned int ib_qib_max_srq_wrs;
  
  extern const u32 ib_qib_rnr_table[];
  
+extern const struct rvt_operation_params qib_post_parms[];
+
  #endif                          /* QIB_VERBS_H */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c

index 565c881a44ba069f69f64da6b1a215fdc3bf7f19..c229b9f4a52da65d8559d832d930e0c73d7109d9 100644 (file)
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -331,6 +331,21 @@ static int usnic_port_immutable(struct ib_device *ibdev, u8 port_num,
         return 0;
  }
  
+static void usnic_get_dev_fw_str(struct ib_device *device,
+                                char *str,
+                                size_t str_len)
+{
+       struct usnic_ib_dev *us_ibdev =
+               container_of(device, struct usnic_ib_dev, ib_dev);
+       struct ethtool_drvinfo info;
+
+       mutex_lock(&us_ibdev->usdev_lock);
+       us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info);
+       mutex_unlock(&us_ibdev->usdev_lock);
+
+       snprintf(str, str_len, "%s", info.fw_version);
+}
+
  /* Start of PF discovery section */
  static void *usnic_ib_device_add(struct pci_dev *dev)
  {
@@ -414,6 +429,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
         us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq;
         us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr;
         us_ibdev->ib_dev.get_port_immutable = usnic_port_immutable;
+       us_ibdev->ib_dev.get_dev_fw_str     = usnic_get_dev_fw_str;
  
  
         if (ib_register_device(&us_ibdev->ib_dev, NULL))
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c

index 3412ea06116e2cca6ba802571edd5e48186edc7f..80ef3f8998c87136a6f941e151138f1440aa031e 100644 (file)
--- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
@@ -45,21 +45,6 @@
  #include "usnic_ib_verbs.h"
  #include "usnic_log.h"
  
-static ssize_t usnic_ib_show_fw_ver(struct device *device,
-                                       struct device_attribute *attr,
-                                       char *buf)
-{
-       struct usnic_ib_dev *us_ibdev =
-               container_of(device, struct usnic_ib_dev, ib_dev.dev);
-       struct ethtool_drvinfo info;
-
-       mutex_lock(&us_ibdev->usdev_lock);
-       us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info);
-       mutex_unlock(&us_ibdev->usdev_lock);
-
-       return scnprintf(buf, PAGE_SIZE, "%s\n", info.fw_version);
-}
-
  static ssize_t usnic_ib_show_board(struct device *device,
                                         struct device_attribute *attr,
                                         char *buf)
@@ -192,7 +177,6 @@ usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr,
                         us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]);
  }
  
-static DEVICE_ATTR(fw_ver, S_IRUGO, usnic_ib_show_fw_ver, NULL);
  static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL);
  static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL);
  static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL);
@@ -201,7 +185,6 @@ static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL);
  static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL);
  
  static struct device_attribute *usnic_class_attributes[] = {
-       &dev_attr_fw_ver,
         &dev_attr_board_id,
         &dev_attr_config,
         &dev_attr_iface,
diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile

index 988b6a0101a4de289cee23032507fc5e72915c8c..8b095b27db8705c822df6501b1d11621a3931fac 100644 (file)
--- a/drivers/infiniband/sw/Makefile
+++ b/drivers/infiniband/sw/Makefile
@@ -1 +1,2 @@
  obj-$(CONFIG_INFINIBAND_RDMAVT)                += rdmavt/
+obj-$(CONFIG_RDMA_RXE)                 += rxe/
diff --git a/drivers/infiniband/sw/rdmavt/Kconfig b/drivers/infiniband/sw/rdmavt/Kconfig

index 11aa6a34bd71d9eaa5d68838ee5c559bed8cd965..1da8d01a68550b3ec361581321995e82e212f208 100644 (file)
--- a/drivers/infiniband/sw/rdmavt/Kconfig
+++ b/drivers/infiniband/sw/rdmavt/Kconfig
@@ -1,6 +1,5 @@
  config INFINIBAND_RDMAVT
         tristate "RDMA verbs transport library"
         depends on 64BIT
-       default m
         ---help---
         This is a common software verbs provider for RDMA networks.
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c

index 6ca6fa80dd6e7883c071003de9a10853dc593f10..f2f229efbe64d7d8ddcdf4f985ad23be76ec6977 100644 (file)
--- a/drivers/infiniband/sw/rdmavt/cq.c
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -510,6 +510,7 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi)
  
         if (rdi->worker)
                 return 0;
+       spin_lock_init(&rdi->n_cqs_lock);
         rdi->worker = kzalloc(sizeof(*rdi->worker), GFP_KERNEL);
         if (!rdi->worker)
                 return -ENOMEM;
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c

index 0f4d4500f45e25597322e00cd744c4d05cda9831..80c4b6b401b83af99d8a1dde1ada784daeb8b6cb 100644 (file)
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -140,6 +140,7 @@ static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd,
         init_completion(&mr->comp);
         /* count returning the ptr to user */
         atomic_set(&mr->refcount, 1);
+       atomic_set(&mr->lkey_invalid, 0);
         mr->pd = pd;
         mr->max_segs = count;
         return 0;
@@ -479,6 +480,123 @@ struct ib_mr *rvt_alloc_mr(struct ib_pd *pd,
         return &mr->ibmr;
  }
  
+/**
+ * rvt_set_page - page assignment function called by ib_sg_to_pages
+ * @ibmr: memory region
+ * @addr: dma address of mapped page
+ *
+ * Return: 0 on success, -ENOMEM if max_segs pages are already mapped
+ */
+static int rvt_set_page(struct ib_mr *ibmr, u64 addr)
+{
+       struct rvt_mr *mr = to_imr(ibmr);
+       u32 ps = 1 << mr->mr.page_shift;
+       u32 mapped_segs = mr->mr.length >> mr->mr.page_shift;
+       int m, n;
+
+       if (unlikely(mapped_segs == mr->mr.max_segs))
+               return -ENOMEM;
+
+       if (mr->mr.length == 0) {
+               mr->mr.user_base = addr;
+               mr->mr.iova = addr;
+       }
+
+       m = mapped_segs / RVT_SEGSZ;
+       n = mapped_segs % RVT_SEGSZ;
+       mr->mr.map[m]->segs[n].vaddr = (void *)addr;
+       mr->mr.map[m]->segs[n].length = ps;
+       mr->mr.length += ps;
+
+       return 0;
+}
+
+/**
+ * rvt_map_mr_sg - map sg list and set it as the memory region
+ * @ibmr: memory region
+ * @sg: dma mapped scatterlist
+ * @sg_nents: number of entries in sg
+ * @sg_offset: offset in bytes into sg
+ *
+ * Return: number of sg elements mapped to the memory region
+ */
+int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+                 int sg_nents, unsigned int *sg_offset)
+{
+       struct rvt_mr *mr = to_imr(ibmr);
+
+       mr->mr.length = 0;
+       mr->mr.page_shift = PAGE_SHIFT;
+       return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
+                             rvt_set_page);
+}
+
+/**
+ * rvt_fast_reg_mr - fast register physical MR
+ * @qp: the queue pair where the work request comes from
+ * @ibmr: the memory region to be registered
+ * @key: updated key for this memory region
+ * @access: access flags for this memory region
+ *
+ * Returns 0 on success, -EACCES on PD mismatch, -EINVAL on a bad MR or key.
+ */
+int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key,
+                   int access)
+{
+       struct rvt_mr *mr = to_imr(ibmr);
+
+       if (qp->ibqp.pd != mr->mr.pd)
+               return -EACCES;
+
+       /* not applicable to dma MR or user MR */
+       if (!mr->mr.lkey || mr->umem)
+               return -EINVAL;
+
+       if ((key & 0xFFFFFF00) != (mr->mr.lkey & 0xFFFFFF00))
+               return -EINVAL;
+
+       ibmr->lkey = key;
+       ibmr->rkey = key;
+       mr->mr.lkey = key;
+       mr->mr.access_flags = access;
+       atomic_set(&mr->mr.lkey_invalid, 0);
+
+       return 0;
+}
+EXPORT_SYMBOL(rvt_fast_reg_mr);
+
+/**
+ * rvt_invalidate_rkey - invalidate an MR rkey
+ * @qp: queue pair associated with the invalidate op
+ * @rkey: rkey to invalidate
+ *
+ * Returns 0 on success, -EINVAL if rkey is 0 or no matching MR is found.
+ */
+int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey)
+{
+       struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device);
+       struct rvt_lkey_table *rkt = &dev->lkey_table;
+       struct rvt_mregion *mr;
+
+       if (rkey == 0)
+               return -EINVAL;
+
+       rcu_read_lock();
+       mr = rcu_dereference(
+               rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]);
+       if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
+               goto bail;
+
+       atomic_set(&mr->lkey_invalid, 1);
+       rcu_read_unlock();
+       return 0;
+
+bail:
+       rcu_read_unlock();
+       return -EINVAL;
+}
+EXPORT_SYMBOL(rvt_invalidate_rkey);
+
  /**
   * rvt_alloc_fmr - allocate a fast memory region
   * @pd: the protection domain for this memory region
@@ -682,7 +800,8 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
         }
         mr = rcu_dereference(
                 rkt->table[(sge->lkey >> (32 - dev->dparms.lkey_table_size))]);
-       if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
+       if (unlikely(!mr || atomic_read(&mr->lkey_invalid) ||
+                    mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
                 goto bail;
  
         off = sge->addr - mr->user_base;
@@ -782,7 +901,8 @@ int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
  
         mr = rcu_dereference(
                 rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]);
-       if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
+       if (unlikely(!mr || atomic_read(&mr->lkey_invalid) ||
+                    mr->lkey != rkey || qp->ibqp.pd != mr->pd))
                 goto bail;
  
         off = vaddr - mr->iova;
diff --git a/drivers/infiniband/sw/rdmavt/mr.h b/drivers/infiniband/sw/rdmavt/mr.h

index 69380512c6d16ce3769f43915c5d8d8606cbe14d..132800ee0205130ad73f3fcb3ae7f18a8cec6e7d 100644 (file)
--- a/drivers/infiniband/sw/rdmavt/mr.h
+++ b/drivers/infiniband/sw/rdmavt/mr.h
@@ -82,6 +82,8 @@ int rvt_dereg_mr(struct ib_mr *ibmr);
  struct ib_mr *rvt_alloc_mr(struct ib_pd *pd,
                            enum ib_mr_type mr_type,
                            u32 max_num_sg);
+int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+                 int sg_nents, unsigned int *sg_offset);
  struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
                              struct ib_fmr_attr *fmr_attr);
  int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c

index 41ba7e9cadaab29895fec78c0284ddfed2558949..bdb540f25a888dcc24b24cf96268c6d8f8a3c479 100644 (file)
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -435,8 +435,7 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
         for (n = 0; n < rvt_max_atomic(rdi); n++) {
                 struct rvt_ack_entry *e = &qp->s_ack_queue[n];
  
-               if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
-                   e->rdma_sge.mr) {
+               if (e->rdma_sge.mr) {
                         rvt_put_mr(e->rdma_sge.mr);
                         e->rdma_sge.mr = NULL;
                 }
@@ -584,6 +583,7 @@ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
                 qp->r_rq.wq->tail = 0;
         }
         qp->r_sge.num_sge = 0;
+       atomic_set(&qp->s_reserved_used, 0);
  }
  
  /**
@@ -613,6 +613,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
         struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
         void *priv = NULL;
         gfp_t gfp;
+       size_t sqsize;
  
         if (!rdi)
                 return ERR_PTR(-EINVAL);
@@ -643,7 +644,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                     init_attr->cap.max_recv_wr == 0)
                         return ERR_PTR(-EINVAL);
         }
-
+       sqsize =
+               init_attr->cap.max_send_wr + 1 +
+               rdi->dparms.reserved_operations;
         switch (init_attr->qp_type) {
         case IB_QPT_SMI:
         case IB_QPT_GSI:
@@ -658,11 +661,11 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                         sizeof(struct rvt_swqe);
                 if (gfp == GFP_NOIO)
                         swq = __vmalloc(
-                               (init_attr->cap.max_send_wr + 1) * sz,
+                               sqsize * sz,
                                 gfp | __GFP_ZERO, PAGE_KERNEL);
                 else
                         swq = vzalloc_node(
-                               (init_attr->cap.max_send_wr + 1) * sz,
+                               sqsize * sz,
                                 rdi->dparms.node);
                 if (!swq)
                         return ERR_PTR(-ENOMEM);
@@ -741,13 +744,14 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                 spin_lock_init(&qp->s_lock);
                 spin_lock_init(&qp->r_rq.lock);
                 atomic_set(&qp->refcount, 0);
+               atomic_set(&qp->local_ops_pending, 0);
                 init_waitqueue_head(&qp->wait);
                 init_timer(&qp->s_timer);
                 qp->s_timer.data = (unsigned long)qp;
                 INIT_LIST_HEAD(&qp->rspwait);
                 qp->state = IB_QPS_RESET;
                 qp->s_wq = swq;
-               qp->s_size = init_attr->cap.max_send_wr + 1;
+               qp->s_size = sqsize;
                 qp->s_avail = init_attr->cap.max_send_wr;
                 qp->s_max_sge = init_attr->cap.max_send_sge;
                 if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
@@ -1332,7 +1336,8 @@ int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
         attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask;
         attr->dest_qp_num = qp->remote_qpn;
         attr->qp_access_flags = qp->qp_access_flags;
-       attr->cap.max_send_wr = qp->s_size - 1;
+       attr->cap.max_send_wr = qp->s_size - 1 -
+               rdi->dparms.reserved_operations;
         attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
         attr->cap.max_send_sge = qp->s_max_sge;
         attr->cap.max_recv_sge = qp->r_rq.max_sge;
@@ -1440,25 +1445,116 @@ int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
  }
  
  /**
- * qp_get_savail - return number of avail send entries
+ * rvt_qp_valid_operation - validate post send wr request
+ * @qp - the qp
+ * @post_parms - the post send table for the driver
+ * @wr - the work request
+ *
+ * The routine validates the operation based on the
+ * validation table and returns the length of the operation
+ * which can extend beyond the ib_send_wr.  Operation
+ * dependent flags key atomic operation validation.
   *
+ * There is an exception for UD qps that validates the pd and
+ * overrides the length to include the additional UD specific
+ * length.
+ *
+ * Returns a negative error or the length of the work request
+ * for building the swqe.
+ */
+static inline int rvt_qp_valid_operation(
+       struct rvt_qp *qp,
+       const struct rvt_operation_params *post_parms,
+       struct ib_send_wr *wr)
+{
+       int len;
+
+       if (wr->opcode >= RVT_OPERATION_MAX || !post_parms[wr->opcode].length)
+               return -EINVAL;
+       if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type)))
+               return -EINVAL;
+       if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) &&
+           ibpd_to_rvtpd(qp->ibqp.pd)->user)
+               return -EINVAL;
+       if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE &&
+           (wr->num_sge == 0 ||
+            wr->sg_list[0].length < sizeof(u64) ||
+            wr->sg_list[0].addr & (sizeof(u64) - 1)))
+               return -EINVAL;
+       if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC &&
+           !qp->s_max_rd_atomic)
+               return -EINVAL;
+       len = post_parms[wr->opcode].length;
+       /* UD specific */
+       if (qp->ibqp.qp_type != IB_QPT_UC &&
+           qp->ibqp.qp_type != IB_QPT_RC) {
+               if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
+                       return -EINVAL;
+               len = sizeof(struct ib_ud_wr);
+       }
+       return len;
+}
+
+/**
+ * rvt_qp_is_avail - determine queue capacity
   * @qp - the qp
+ * @rdi - the rdmavt device
+ * @reserved_op - is reserved operation
   *
   * This assumes the s_hlock is held but the s_last
   * qp variable is uncontrolled.
+ *
+ * For non reserved operations, the qp->s_avail
+ * may be changed.
+ *
+ * The return value is zero or a -ENOMEM.
   */
-static inline u32 qp_get_savail(struct rvt_qp *qp)
+static inline int rvt_qp_is_avail(
+       struct rvt_qp *qp,
+       struct rvt_dev_info *rdi,
+       bool reserved_op)
  {
         u32 slast;
-       u32 ret;
-
+       u32 avail;
+       u32 reserved_used;
+
+       /* see rvt_qp_wqe_unreserve() */
+       smp_mb__before_atomic();
+       reserved_used = atomic_read(&qp->s_reserved_used);
+       if (unlikely(reserved_op)) {
+               /* see rvt_qp_wqe_unreserve() */
+               smp_mb__before_atomic();
+               if (reserved_used >= rdi->dparms.reserved_operations)
+                       return -ENOMEM;
+               return 0;
+       }
+       /* non-reserved operations */
+       if (likely(qp->s_avail))
+               return 0;
         smp_read_barrier_depends(); /* see rc.c */
         slast = ACCESS_ONCE(qp->s_last);
         if (qp->s_head >= slast)
-               ret = qp->s_size - (qp->s_head - slast);
+               avail = qp->s_size - (qp->s_head - slast);
         else
-               ret = slast - qp->s_head;
-       return ret - 1;
+               avail = slast - qp->s_head;
+
+       /* see rvt_qp_wqe_unreserve() */
+       smp_mb__before_atomic();
+       reserved_used = atomic_read(&qp->s_reserved_used);
+       avail =  avail - 1 -
+               (rdi->dparms.reserved_operations - reserved_used);
+       /* ensure we don't assign a negative s_avail */
+       if ((s32)avail <= 0)
+               return -ENOMEM;
+       qp->s_avail = avail;
+       if (WARN_ON(qp->s_avail >
+                   (qp->s_size - 1 - rdi->dparms.reserved_operations)))
+               rvt_pr_err(rdi,
+                          "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
+                          qp->ibqp.qp_num, qp->s_size, qp->s_avail,
+                          qp->s_head, qp->s_tail, qp->s_cur,
+                          qp->s_acked, qp->s_last);
+       return 0;
  }
  
  /**
@@ -1480,49 +1576,64 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
         struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
         u8 log_pmtu;
         int ret;
+       size_t cplen;
+       bool reserved_op;
+       int local_ops_delayed = 0;
+
+       BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE));
  
         /* IB spec says that num_sge == 0 is OK. */
         if (unlikely(wr->num_sge > qp->s_max_sge))
                 return -EINVAL;
  
+       ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr);
+       if (ret < 0)
+               return ret;
+       cplen = ret;
+
         /*
-        * Don't allow RDMA reads or atomic operations on UC or
-        * undefined operations.
-        * Make sure buffer is large enough to hold the result for atomics.
+        * Local operations include fast register and local invalidate.
+        * Fast register needs to be processed immediately because the
+        * registered lkey may be used by following work requests and the
+        * lkey needs to be valid at the time those requests are posted.
+        * Local invalidate can be processed immediately if fencing is
+        * not required and no previous local invalidate ops are pending.
+        * Signaled local operations that have been processed immediately
+        * need to have requests with "completion only" flags set posted
+        * to the send queue in order to generate completions.
          */
-       if (qp->ibqp.qp_type == IB_QPT_UC) {
-               if ((unsigned)wr->opcode >= IB_WR_RDMA_READ)
-                       return -EINVAL;
-       } else if (qp->ibqp.qp_type != IB_QPT_RC) {
-               /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */
-               if (wr->opcode != IB_WR_SEND &&
-                   wr->opcode != IB_WR_SEND_WITH_IMM)
-                       return -EINVAL;
-               /* Check UD destination address PD */
-               if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
+       if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) {
+               switch (wr->opcode) {
+               case IB_WR_REG_MR:
+                       ret = rvt_fast_reg_mr(qp,
+                                             reg_wr(wr)->mr,
+                                             reg_wr(wr)->key,
+                                             reg_wr(wr)->access);
+                       if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
+                               return ret;
+                       break;
+               case IB_WR_LOCAL_INV:
+                       if ((wr->send_flags & IB_SEND_FENCE) ||
+                           atomic_read(&qp->local_ops_pending)) {
+                               local_ops_delayed = 1;
+                       } else {
+                               ret = rvt_invalidate_rkey(
+                                       qp, wr->ex.invalidate_rkey);
+                               if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
+                                       return ret;
+                       }
+                       break;
+               default:
                         return -EINVAL;
-       } else if ((unsigned)wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) {
-               return -EINVAL;
-       } else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
-                  (wr->num_sge == 0 ||
-                   wr->sg_list[0].length < sizeof(u64) ||
-                   wr->sg_list[0].addr & (sizeof(u64) - 1))) {
-               return -EINVAL;
-       } else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) {
-               return -EINVAL;
+               }
         }
+
+       reserved_op = rdi->post_parms[wr->opcode].flags &
+                       RVT_OPERATION_USE_RESERVE;
         /* check for avail */
-       if (unlikely(!qp->s_avail)) {
-               qp->s_avail = qp_get_savail(qp);
-               if (WARN_ON(qp->s_avail > (qp->s_size - 1)))
-                       rvt_pr_err(rdi,
-                                  "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
-                                  qp->ibqp.qp_num, qp->s_size, qp->s_avail,
-                                  qp->s_head, qp->s_tail, qp->s_cur,
-                                  qp->s_acked, qp->s_last);
-               if (!qp->s_avail)
-                       return -ENOMEM;
-       }
+       ret = rvt_qp_is_avail(qp, rdi, reserved_op);
+       if (ret)
+               return ret;
         next = qp->s_head + 1;
         if (next >= qp->s_size)
                 next = 0;
@@ -1531,18 +1642,8 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
         pd = ibpd_to_rvtpd(qp->ibqp.pd);
         wqe = rvt_get_swqe_ptr(qp, qp->s_head);
  
-       if (qp->ibqp.qp_type != IB_QPT_UC &&
-           qp->ibqp.qp_type != IB_QPT_RC)
-               memcpy(&wqe->ud_wr, ud_wr(wr), sizeof(wqe->ud_wr));
-       else if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
-                wr->opcode == IB_WR_RDMA_WRITE ||
-                wr->opcode == IB_WR_RDMA_READ)
-               memcpy(&wqe->rdma_wr, rdma_wr(wr), sizeof(wqe->rdma_wr));
-       else if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
-               memcpy(&wqe->atomic_wr, atomic_wr(wr), sizeof(wqe->atomic_wr));
-       else
-               memcpy(&wqe->wr, wr, sizeof(wqe->wr));
+       /* cplen has length from above */
+       memcpy(&wqe->wr, wr, cplen);
  
         wqe->length = 0;
         j = 0;
@@ -1585,14 +1686,29 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
                 atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
         }
  
-       wqe->ssn = qp->s_ssn++;
-       wqe->psn = qp->s_next_psn;
-       wqe->lpsn = wqe->psn +
-                       (wqe->length ? ((wqe->length - 1) >> log_pmtu) : 0);
-       qp->s_next_psn = wqe->lpsn + 1;
+       if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
+               if (local_ops_delayed)
+                       atomic_inc(&qp->local_ops_pending);
+               else
+                       wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY;
+               wqe->ssn = 0;
+               wqe->psn = 0;
+               wqe->lpsn = 0;
+       } else {
+               wqe->ssn = qp->s_ssn++;
+               wqe->psn = qp->s_next_psn;
+               wqe->lpsn = wqe->psn +
+                               (wqe->length ?
+                                       ((wqe->length - 1) >> log_pmtu) :
+                                       0);
+               qp->s_next_psn = wqe->lpsn + 1;
+       }
         trace_rvt_post_one_wr(qp, wqe);
+       if (unlikely(reserved_op))
+               rvt_qp_wqe_reserve(qp, wqe);
+       else
+               qp->s_avail--;
         smp_wmb(); /* see request builders */
-       qp->s_avail--;
         qp->s_head = next;
  
         return 0;
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c

index 30c4fda7a05a6552e5a26d5dc1e52af200959a17..d430c2f7cec4cea4fc24f465dedd0b30e27f29ce 100644 (file)
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -370,6 +370,7 @@ enum {
         REG_USER_MR,
         DEREG_MR,
         ALLOC_MR,
+       MAP_MR_SG,
         ALLOC_FMR,
         MAP_PHYS_FMR,
         UNMAP_FMR,
@@ -528,7 +529,8 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
                                                          post_send),
                                            rvt_post_send))
                         if (!rdi->driver_f.schedule_send ||
-                           !rdi->driver_f.do_send)
+                           !rdi->driver_f.do_send ||
+                           !rdi->post_parms)
                                 return -EINVAL;
                 break;
  
@@ -633,6 +635,12 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
                                       rvt_alloc_mr);
                 break;
  
+       case MAP_MR_SG:
+               check_driver_override(rdi, offsetof(struct ib_device,
+                                                   map_mr_sg),
+                                     rvt_map_mr_sg);
+               break;
+
         case MAP_PHYS_FMR:
                 check_driver_override(rdi, offsetof(struct ib_device,
                                                     map_phys_fmr),
diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig

new file mode 100644 (file)

index 0000000..1e4e628
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/Kconfig
@@ -0,0 +1,24 @@
+config RDMA_RXE
+       tristate "Software RDMA over Ethernet (RoCE) driver"
+       depends on INET && PCI && INFINIBAND
+       depends on NET_UDP_TUNNEL
+       ---help---
+       This driver implements the InfiniBand RDMA transport over
+       the Linux network stack. It enables a system with a
+       standard Ethernet adapter to interoperate with a RoCE
+       adapter or with another system running the RXE driver.
+       Documentation on InfiniBand and RoCE can be downloaded at
+       www.infinibandta.org and www.openfabrics.org. (See also
+       siw which is a similar software driver for iWARP.)
+
+       The driver is split into two layers, one interfaces with the
+       Linux RDMA stack and implements a kernel or user space
+       verbs API. The user space verbs API requires a support
+       library named librxe which is loaded by the generic user
+       space verbs API, libibverbs. The other layer interfaces
+       with the Linux network stack at layer 3.
+
+       To configure and work with soft-RoCE driver please use the
+       following wiki page under "configure Soft-RoCE (RXE)" section:
+
+       https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home
diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile

new file mode 100644 (file)

index 0000000..3b3fb9d
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/Makefile
@@ -0,0 +1,24 @@
+obj-$(CONFIG_RDMA_RXE) += rdma_rxe.o
+
+rdma_rxe-y := \
+       rxe.o \
+       rxe_comp.o \
+       rxe_req.o \
+       rxe_resp.o \
+       rxe_recv.o \
+       rxe_pool.o \
+       rxe_queue.o \
+       rxe_verbs.o \
+       rxe_av.o \
+       rxe_srq.o \
+       rxe_qp.o \
+       rxe_cq.o \
+       rxe_mr.o \
+       rxe_dma.o \
+       rxe_opcode.o \
+       rxe_mmap.o \
+       rxe_icrc.o \
+       rxe_mcast.o \
+       rxe_task.o \
+       rxe_net.o \
+       rxe_sysfs.o
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c

new file mode 100644 (file)

index 0000000..55f0e8f
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib");
+MODULE_DESCRIPTION("Soft RDMA transport");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION("0.2");
+
+/* free resources for all ports on a device */
+static void rxe_cleanup_ports(struct rxe_dev *rxe)
+{
+       kfree(rxe->port.pkey_tbl);
+       rxe->port.pkey_tbl = NULL;
+
+}
+
+/* free resources for a rxe device; all objects created for this device must
+ * have been destroyed
+ */
+static void rxe_cleanup(struct rxe_dev *rxe)
+{
+       rxe_pool_cleanup(&rxe->uc_pool);
+       rxe_pool_cleanup(&rxe->pd_pool);
+       rxe_pool_cleanup(&rxe->ah_pool);
+       rxe_pool_cleanup(&rxe->srq_pool);
+       rxe_pool_cleanup(&rxe->qp_pool);
+       rxe_pool_cleanup(&rxe->cq_pool);
+       rxe_pool_cleanup(&rxe->mr_pool);
+       rxe_pool_cleanup(&rxe->mw_pool);
+       rxe_pool_cleanup(&rxe->mc_grp_pool);
+       rxe_pool_cleanup(&rxe->mc_elem_pool);
+
+       rxe_cleanup_ports(rxe);
+}
+
+/* called when all references have been dropped */
+void rxe_release(struct kref *kref)
+{
+       struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt);
+
+       rxe_cleanup(rxe);
+       ib_dealloc_device(&rxe->ib_dev);
+}
+
+void rxe_dev_put(struct rxe_dev *rxe)
+{
+       kref_put(&rxe->ref_cnt, rxe_release);
+}
+EXPORT_SYMBOL_GPL(rxe_dev_put);
+
+/* initialize rxe device parameters */
+static int rxe_init_device_param(struct rxe_dev *rxe)
+{
+       rxe->max_inline_data                    = RXE_MAX_INLINE_DATA;
+
+       rxe->attr.fw_ver                        = RXE_FW_VER;
+       rxe->attr.max_mr_size                   = RXE_MAX_MR_SIZE;
+       rxe->attr.page_size_cap                 = RXE_PAGE_SIZE_CAP;
+       rxe->attr.vendor_id                     = RXE_VENDOR_ID;
+       rxe->attr.vendor_part_id                = RXE_VENDOR_PART_ID;
+       rxe->attr.hw_ver                        = RXE_HW_VER;
+       rxe->attr.max_qp                        = RXE_MAX_QP;
+       rxe->attr.max_qp_wr                     = RXE_MAX_QP_WR;
+       rxe->attr.device_cap_flags              = RXE_DEVICE_CAP_FLAGS;
+       rxe->attr.max_sge                       = RXE_MAX_SGE;
+       rxe->attr.max_sge_rd                    = RXE_MAX_SGE_RD;
+       rxe->attr.max_cq                        = RXE_MAX_CQ;
+       rxe->attr.max_cqe                       = (1 << RXE_MAX_LOG_CQE) - 1;
+       rxe->attr.max_mr                        = RXE_MAX_MR;
+       rxe->attr.max_pd                        = RXE_MAX_PD;
+       rxe->attr.max_qp_rd_atom                = RXE_MAX_QP_RD_ATOM;
+       rxe->attr.max_ee_rd_atom                = RXE_MAX_EE_RD_ATOM;
+       rxe->attr.max_res_rd_atom               = RXE_MAX_RES_RD_ATOM;
+       rxe->attr.max_qp_init_rd_atom           = RXE_MAX_QP_INIT_RD_ATOM;
+       rxe->attr.max_ee_init_rd_atom           = RXE_MAX_EE_INIT_RD_ATOM;
+       rxe->attr.atomic_cap                    = RXE_ATOMIC_CAP;
+       rxe->attr.max_ee                        = RXE_MAX_EE;
+       rxe->attr.max_rdd                       = RXE_MAX_RDD;
+       rxe->attr.max_mw                        = RXE_MAX_MW;
+       rxe->attr.max_raw_ipv6_qp               = RXE_MAX_RAW_IPV6_QP;
+       rxe->attr.max_raw_ethy_qp               = RXE_MAX_RAW_ETHY_QP;
+       rxe->attr.max_mcast_grp                 = RXE_MAX_MCAST_GRP;
+       rxe->attr.max_mcast_qp_attach           = RXE_MAX_MCAST_QP_ATTACH;
+       rxe->attr.max_total_mcast_qp_attach     = RXE_MAX_TOT_MCAST_QP_ATTACH;
+       rxe->attr.max_ah                        = RXE_MAX_AH;
+       rxe->attr.max_fmr                       = RXE_MAX_FMR;
+       rxe->attr.max_map_per_fmr               = RXE_MAX_MAP_PER_FMR;
+       rxe->attr.max_srq                       = RXE_MAX_SRQ;
+       rxe->attr.max_srq_wr                    = RXE_MAX_SRQ_WR;
+       rxe->attr.max_srq_sge                   = RXE_MAX_SRQ_SGE;
+       rxe->attr.max_fast_reg_page_list_len    = RXE_MAX_FMR_PAGE_LIST_LEN;
+       rxe->attr.max_pkeys                     = RXE_MAX_PKEYS;
+       rxe->attr.local_ca_ack_delay            = RXE_LOCAL_CA_ACK_DELAY;
+
+       rxe->max_ucontext                       = RXE_MAX_UCONTEXT;
+
+       return 0;
+}
+
+/* initialize port attributes */
+static int rxe_init_port_param(struct rxe_port *port)
+{
+       port->attr.state                = RXE_PORT_STATE;
+       port->attr.max_mtu              = RXE_PORT_MAX_MTU;
+       port->attr.active_mtu           = RXE_PORT_ACTIVE_MTU;
+       port->attr.gid_tbl_len          = RXE_PORT_GID_TBL_LEN;
+       port->attr.port_cap_flags       = RXE_PORT_PORT_CAP_FLAGS;
+       port->attr.max_msg_sz           = RXE_PORT_MAX_MSG_SZ;
+       port->attr.bad_pkey_cntr        = RXE_PORT_BAD_PKEY_CNTR;
+       port->attr.qkey_viol_cntr       = RXE_PORT_QKEY_VIOL_CNTR;
+       port->attr.pkey_tbl_len         = RXE_PORT_PKEY_TBL_LEN;
+       port->attr.lid                  = RXE_PORT_LID;
+       port->attr.sm_lid               = RXE_PORT_SM_LID;
+       port->attr.lmc                  = RXE_PORT_LMC;
+       port->attr.max_vl_num           = RXE_PORT_MAX_VL_NUM;
+       port->attr.sm_sl                = RXE_PORT_SM_SL;
+       port->attr.subnet_timeout       = RXE_PORT_SUBNET_TIMEOUT;
+       port->attr.init_type_reply      = RXE_PORT_INIT_TYPE_REPLY;
+       port->attr.active_width         = RXE_PORT_ACTIVE_WIDTH;
+       port->attr.active_speed         = RXE_PORT_ACTIVE_SPEED;
+       port->attr.phys_state           = RXE_PORT_PHYS_STATE;
+       port->mtu_cap                   =
+                               ib_mtu_enum_to_int(RXE_PORT_ACTIVE_MTU);
+       port->subnet_prefix             = cpu_to_be64(RXE_PORT_SUBNET_PREFIX);
+
+       return 0;
+}
+
+/* initialize port state, note IB convention that HCA ports are always
+ * numbered from 1
+ */
+static int rxe_init_ports(struct rxe_dev *rxe)
+{
+       struct rxe_port *port = &rxe->port;
+
+       rxe_init_port_param(port);
+
+       if (!port->attr.pkey_tbl_len || !port->attr.gid_tbl_len)
+               return -EINVAL;
+
+       port->pkey_tbl = kcalloc(port->attr.pkey_tbl_len,
+                       sizeof(*port->pkey_tbl), GFP_KERNEL);
+
+       if (!port->pkey_tbl)
+               return -ENOMEM;
+
+       port->pkey_tbl[0] = 0xffff;
+       port->port_guid = rxe->ifc_ops->port_guid(rxe);
+
+       spin_lock_init(&port->port_lock);
+
+       return 0;
+}
+
+/* init pools of managed objects */
+static int rxe_init_pools(struct rxe_dev *rxe)
+{
+       int err;
+
+       err = rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC,
+                           rxe->max_ucontext);
+       if (err)
+               goto err1;
+
+       err = rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD,
+                           rxe->attr.max_pd);
+       if (err)
+               goto err2;
+
+       err = rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH,
+                           rxe->attr.max_ah);
+       if (err)
+               goto err3;
+
+       err = rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ,
+                           rxe->attr.max_srq);
+       if (err)
+               goto err4;
+
+       err = rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP,
+                           rxe->attr.max_qp);
+       if (err)
+               goto err5;
+
+       err = rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ,
+                           rxe->attr.max_cq);
+       if (err)
+               goto err6;
+
+       err = rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR,
+                           rxe->attr.max_mr);
+       if (err)
+               goto err7;
+
+       err = rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW,
+                           rxe->attr.max_mw);
+       if (err)
+               goto err8;
+
+       err = rxe_pool_init(rxe, &rxe->mc_grp_pool, RXE_TYPE_MC_GRP,
+                           rxe->attr.max_mcast_grp);
+       if (err)
+               goto err9;
+
+       err = rxe_pool_init(rxe, &rxe->mc_elem_pool, RXE_TYPE_MC_ELEM,
+                           rxe->attr.max_total_mcast_qp_attach);
+       if (err)
+               goto err10;
+
+       return 0;
+
+err10:
+       rxe_pool_cleanup(&rxe->mc_grp_pool);
+err9:
+       rxe_pool_cleanup(&rxe->mw_pool);
+err8:
+       rxe_pool_cleanup(&rxe->mr_pool);
+err7:
+       rxe_pool_cleanup(&rxe->cq_pool);
+err6:
+       rxe_pool_cleanup(&rxe->qp_pool);
+err5:
+       rxe_pool_cleanup(&rxe->srq_pool);
+err4:
+       rxe_pool_cleanup(&rxe->ah_pool);
+err3:
+       rxe_pool_cleanup(&rxe->pd_pool);
+err2:
+       rxe_pool_cleanup(&rxe->uc_pool);
+err1:
+       return err;
+}
+
+/* initialize rxe device state */
+static int rxe_init(struct rxe_dev *rxe)
+{
+       int err;
+
+       /* init default device parameters */
+       rxe_init_device_param(rxe);
+
+       err = rxe_init_ports(rxe);
+       if (err)
+               goto err1;
+
+       err = rxe_init_pools(rxe);
+       if (err)
+               goto err2;
+
+       /* init pending mmap list */
+       spin_lock_init(&rxe->mmap_offset_lock);
+       spin_lock_init(&rxe->pending_lock);
+       INIT_LIST_HEAD(&rxe->pending_mmaps);
+       INIT_LIST_HEAD(&rxe->list);
+
+       mutex_init(&rxe->usdev_lock);
+
+       return 0;
+
+err2:
+       rxe_cleanup_ports(rxe);
+err1:
+       return err;
+}
+
+int rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
+{
+       struct rxe_port *port = &rxe->port;
+       enum ib_mtu mtu;
+
+       mtu = eth_mtu_int_to_enum(ndev_mtu);
+
+       /* Make sure that the new MTU is in range */
+       mtu = mtu ? min_t(enum ib_mtu, mtu, RXE_PORT_MAX_MTU) : IB_MTU_256;
+
+       port->attr.active_mtu = mtu;
+       port->mtu_cap = ib_mtu_enum_to_int(mtu);
+
+       return 0;
+}
+EXPORT_SYMBOL(rxe_set_mtu);
+
+/* called by ifc layer to create new rxe device.
+ * The caller should allocate memory for rxe by calling ib_alloc_device.
+ */
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
+{
+       int err;
+
+       kref_init(&rxe->ref_cnt);
+
+       err = rxe_init(rxe);
+       if (err)
+               goto err1;
+
+       err = rxe_set_mtu(rxe, mtu);
+       if (err)
+               goto err1;
+
+       err = rxe_register_device(rxe);
+       if (err)
+               goto err1;
+
+       return 0;
+
+err1:
+       rxe_dev_put(rxe);
+       return err;
+}
+EXPORT_SYMBOL(rxe_add);
+
+/* called by the ifc layer to remove a device */
+void rxe_remove(struct rxe_dev *rxe)
+{
+       rxe_unregister_device(rxe);
+
+       rxe_dev_put(rxe);
+}
+EXPORT_SYMBOL(rxe_remove);
+
+static int __init rxe_module_init(void)
+{
+       int err;
+
+       /* initialize slab caches for managed objects */
+       err = rxe_cache_init();
+       if (err) {
+               pr_err("rxe: unable to init object pools\n");
+               return err;
+       }
+
+       err = rxe_net_init();
+       if (err) {
+               pr_err("rxe: unable to init\n");
+               rxe_cache_exit();
+               return err;
+       }
+       pr_info("rxe: loaded\n");
+
+       return 0;
+}
+
+static void __exit rxe_module_exit(void)
+{
+       rxe_remove_all();
+       rxe_net_exit();
+       rxe_cache_exit();
+
+       pr_info("rxe: unloaded\n");
+}
+
+module_init(rxe_module_init);
+module_exit(rxe_module_exit);
diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h

new file mode 100644 (file)

index 0000000..12c71c5
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_H
+#define RXE_H
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/crc32.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+#include "rxe_net.h"
+#include "rxe_opcode.h"
+#include "rxe_hdr.h"
+#include "rxe_param.h"
+#include "rxe_verbs.h"
+
+#define RXE_UVERBS_ABI_VERSION         (1)
+
+#define IB_PHYS_STATE_LINK_UP          (5)
+#define IB_PHYS_STATE_LINK_DOWN                (3)
+
+#define RXE_ROCE_V2_SPORT              (0xc000)
+
+int rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
+
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu);
+void rxe_remove(struct rxe_dev *rxe);
+void rxe_remove_all(void);
+
+int rxe_rcv(struct sk_buff *skb);
+
+void rxe_dev_put(struct rxe_dev *rxe);
+struct rxe_dev *net_to_rxe(struct net_device *ndev);
+struct rxe_dev *get_rxe_by_name(const char* name);
+
+void rxe_port_up(struct rxe_dev *rxe);
+void rxe_port_down(struct rxe_dev *rxe);
+
+#endif /* RXE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c

new file mode 100644 (file)

index 0000000..5c94742
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_av.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *             - Redistributions of source code must retain the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer.
+ *
+ *             - Redistributions in binary form must reproduce the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer in the documentation and/or other materials
+ *               provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* Validate the address handle attributes in @attr for device @rxe.
+ *
+ * Only port number 1 is accepted.  When IB_AH_GRH is set in
+ * attr->ah_flags the source GID index must be a valid index into the
+ * port's GID table.
+ *
+ * Returns 0 if the attributes are acceptable, -EINVAL otherwise.
+ */
+int rxe_av_chk_attr(struct rxe_dev *rxe, struct ib_ah_attr *attr)
+{
+       struct rxe_port *port;
+
+       if (attr->port_num != 1) {
+               pr_info("rxe: invalid port_num = %d\n", attr->port_num);
+               return -EINVAL;
+       }
+
+       port = &rxe->port;
+
+       if (attr->ah_flags & IB_AH_GRH) {
+               /* valid indices run from 0 to gid_tbl_len - 1, so an
+                * index equal to the table length is already out of range
+                */
+               if (attr->grh.sgid_index >= port->attr.gid_tbl_len) {
+                       pr_info("rxe: invalid sgid index = %d\n",
+                               attr->grh.sgid_index);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+/* Clear @av, then record @port_num and a copy of the GRH found in @attr.
+ * Always returns 0.
+ */
+int rxe_av_from_attr(struct rxe_dev *rxe, u8 port_num,
+                    struct rxe_av *av, struct ib_ah_attr *attr)
+{
+       /* start from a fully zeroed address vector */
+       memset(av, 0, sizeof(*av));
+
+       av->port_num = port_num;
+       memcpy(&av->grh, &attr->grh, sizeof(attr->grh));
+
+       return 0;
+}
+
+/* Copy the port number and GRH stored in @av back into @attr.
+ * No other field of @attr is written.  Always returns 0.
+ */
+int rxe_av_to_attr(struct rxe_dev *rxe, struct rxe_av *av,
+                  struct ib_ah_attr *attr)
+{
+       attr->port_num = av->port_num;
+       memcpy(&attr->grh, &av->grh, sizeof(av->grh));
+
+       return 0;
+}
+
+/* Fill in the IP-level fields of @av: the network type derived from
+ * @sgid_attr and @sgid, the source address converted from @sgid and the
+ * destination address converted from the dgid in @attr's GRH.
+ * Always returns 0.
+ */
+int rxe_av_fill_ip_info(struct rxe_dev *rxe, struct rxe_av *av,
+                       struct ib_ah_attr *attr,
+                       struct ib_gid_attr *sgid_attr, union ib_gid *sgid)
+{
+       av->network_type = ib_gid_to_network_type(sgid_attr->gid_type, sgid);
+
+       rdma_gid2ip(&av->sgid_addr._sockaddr, sgid);
+       rdma_gid2ip(&av->dgid_addr._sockaddr, &attr->grh.dgid);
+
+       return 0;
+}
+
+/* Return the address vector to use for @pkt.
+ *
+ * RC and UC QPs use the QP's primary address vector; every other QP type
+ * uses the one carried by the packet's WQE.  Returns NULL when @pkt, its
+ * QP, or (for non RC/UC QPs) its WQE is missing.
+ */
+struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt)
+{
+       if (!pkt || !pkt->qp)
+               return NULL;
+
+       switch (qp_type(pkt->qp)) {
+       case IB_QPT_RC:
+       case IB_QPT_UC:
+               return &pkt->qp->pri_av;
+       default:
+               break;
+       }
+
+       if (!pkt->wqe)
+               return NULL;
+
+       return &pkt->wqe->av;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c

new file mode 100644 (file)

index 0000000..36f67de
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+#include "rxe_task.h"
+
+/* States of the completer state machine driven by rxe_completer().
+ * The values also index comp_state_name[] for debug output.
+ */
+enum comp_state {
+       COMPST_GET_ACK,
+       COMPST_GET_WQE,
+       COMPST_COMP_WQE,
+       COMPST_COMP_ACK,
+       COMPST_CHECK_PSN,
+       COMPST_CHECK_ACK,
+       COMPST_READ,
+       COMPST_ATOMIC,
+       COMPST_WRITE_SEND,
+       COMPST_UPDATE_COMP,
+       COMPST_ERROR_RETRY,
+       COMPST_RNR_RETRY,
+       COMPST_ERROR,
+       COMPST_EXIT, /* We have an issue, and we want to rerun the completer */
+       COMPST_DONE, /* The completer finished successfully */
+};
+
+/* Printable name of each completer state, indexed by enum comp_state.
+ * The table is only ever read (by the pr_debug() in rxe_completer()),
+ * so both the pointers and the strings are const.
+ */
+static const char * const comp_state_name[] =  {
+       [COMPST_GET_ACK]                = "GET ACK",
+       [COMPST_GET_WQE]                = "GET WQE",
+       [COMPST_COMP_WQE]               = "COMP WQE",
+       [COMPST_COMP_ACK]               = "COMP ACK",
+       [COMPST_CHECK_PSN]              = "CHECK PSN",
+       [COMPST_CHECK_ACK]              = "CHECK ACK",
+       [COMPST_READ]                   = "READ",
+       [COMPST_ATOMIC]                 = "ATOMIC",
+       [COMPST_WRITE_SEND]             = "WRITE/SEND",
+       [COMPST_UPDATE_COMP]            = "UPDATE COMP",
+       [COMPST_ERROR_RETRY]            = "ERROR RETRY",
+       [COMPST_RNR_RETRY]              = "RNR RETRY",
+       [COMPST_ERROR]                  = "ERROR",
+       [COMPST_EXIT]                   = "EXIT",
+       [COMPST_DONE]                   = "DONE",
+};
+
+/* RNR NAK delay in microseconds, indexed by the IB_RNR_TIMER_* code.
+ * Each constant name spells the delay in milliseconds (for example
+ * IB_RNR_TIMER_061_44 is 61.44 ms, i.e. 61440 usec).  The table is
+ * read-only.
+ */
+static const unsigned long rnrnak_usec[32] = {
+       [IB_RNR_TIMER_655_36] = 655360,
+       [IB_RNR_TIMER_000_01] = 10,
+       [IB_RNR_TIMER_000_02] = 20,
+       [IB_RNR_TIMER_000_03] = 30,
+       [IB_RNR_TIMER_000_04] = 40,
+       [IB_RNR_TIMER_000_06] = 60,
+       [IB_RNR_TIMER_000_08] = 80,
+       [IB_RNR_TIMER_000_12] = 120,
+       [IB_RNR_TIMER_000_16] = 160,
+       [IB_RNR_TIMER_000_24] = 240,
+       [IB_RNR_TIMER_000_32] = 320,
+       [IB_RNR_TIMER_000_48] = 480,
+       [IB_RNR_TIMER_000_64] = 640,
+       [IB_RNR_TIMER_000_96] = 960,
+       [IB_RNR_TIMER_001_28] = 1280,
+       [IB_RNR_TIMER_001_92] = 1920,
+       [IB_RNR_TIMER_002_56] = 2560,
+       [IB_RNR_TIMER_003_84] = 3840,
+       [IB_RNR_TIMER_005_12] = 5120,
+       [IB_RNR_TIMER_007_68] = 7680,
+       [IB_RNR_TIMER_010_24] = 10240,
+       [IB_RNR_TIMER_015_36] = 15360,
+       [IB_RNR_TIMER_020_48] = 20480,
+       [IB_RNR_TIMER_030_72] = 30720,
+       [IB_RNR_TIMER_040_96] = 40960,
+       [IB_RNR_TIMER_061_44] = 61440,
+       [IB_RNR_TIMER_081_92] = 81920,
+       [IB_RNR_TIMER_122_88] = 122880,
+       [IB_RNR_TIMER_163_84] = 163840,
+       [IB_RNR_TIMER_245_76] = 245760,
+       [IB_RNR_TIMER_327_68] = 327680,
+       [IB_RNR_TIMER_491_52] = 491520,
+};
+
+/* Convert an RNR NAK timer code into a delay in jiffies, never returning
+ * less than one jiffy.  @timeout indexes rnrnak_usec[] directly.
+ */
+static inline unsigned long rnrnak_jiffies(u8 timeout)
+{
+       unsigned long delay = usecs_to_jiffies(rnrnak_usec[timeout]);
+
+       return delay ? delay : 1;
+}
+
+/* Map a work request opcode to the opcode reported in its work
+ * completion.  Opcodes without a mapping yield 0xff.
+ */
+static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode)
+{
+       switch (opcode) {
+       case IB_WR_SEND:
+       case IB_WR_SEND_WITH_IMM:
+       case IB_WR_SEND_WITH_INV:
+               return IB_WC_SEND;
+
+       case IB_WR_RDMA_WRITE:
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               return IB_WC_RDMA_WRITE;
+
+       case IB_WR_RDMA_READ:
+       case IB_WR_RDMA_READ_WITH_INV:
+               return IB_WC_RDMA_READ;
+
+       case IB_WR_ATOMIC_CMP_AND_SWP:
+               return IB_WC_COMP_SWAP;
+
+       case IB_WR_ATOMIC_FETCH_AND_ADD:
+               return IB_WC_FETCH_ADD;
+
+       case IB_WR_LSO:
+               return IB_WC_LSO;
+
+       case IB_WR_LOCAL_INV:
+               return IB_WC_LOCAL_INV;
+
+       case IB_WR_REG_MR:
+               return IB_WC_REG_MR;
+
+       default:
+               return 0xff;
+       }
+}
+
+/* Timer callback; @data is the QP pointer cast to unsigned long.
+ * If the QP is still valid, flag a completer timeout and schedule the
+ * completer task.
+ */
+void retransmit_timer(unsigned long data)
+{
+       struct rxe_qp *qp = (struct rxe_qp *)data;
+
+       if (!qp->valid)
+               return;
+
+       qp->comp.timeout = 1;
+       rxe_run_task(&qp->comp.task, 1);
+}
+
+/* Append response packet @skb to the QP's resp_pkts queue and run the
+ * completer task.  The second argument passed to rxe_run_task() is 1
+ * when more than one packet is queued after the append, 0 otherwise.
+ */
+void rxe_comp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp,
+                       struct sk_buff *skb)
+{
+       skb_queue_tail(&qp->resp_pkts, skb);
+
+       if (skb_queue_len(&qp->resp_pkts) > 1)
+               rxe_run_task(&qp->comp.task, 1);
+       else
+               rxe_run_task(&qp->comp.task, 0);
+}
+
+/* Look at the WQE at the head of the send queue, store it in @wqe_p and
+ * pick the next completer state.  Called whether or not a response
+ * packet (@pkt) was dequeued.
+ */
+static inline enum comp_state get_wqe(struct rxe_qp *qp,
+                                     struct rxe_pkt_info *pkt,
+                                     struct rxe_send_wqe **wqe_p)
+{
+       struct rxe_send_wqe *head = queue_head(qp->sq.queue);
+
+       *wqe_p = head;
+
+       if (head && head->state != wqe_state_posted) {
+               /* the WQE needs no ack: complete it right away */
+               if (head->state == wqe_state_done)
+                       return COMPST_COMP_WQE;
+
+               /* the WQE failed */
+               if (head->state == wqe_state_error)
+                       return COMPST_ERROR;
+
+               /* WQE is in flight: with an ack in hand, check its PSN */
+               if (pkt)
+                       return COMPST_CHECK_PSN;
+
+               return COMPST_EXIT;
+       }
+
+       /* queue is empty or the requester has not started the WQE yet */
+       if (pkt)
+               return COMPST_DONE;
+
+       return COMPST_EXIT;
+}
+
+/* Reload the completer's retry and RNR retry budgets from the QP
+ * attributes.
+ */
+static inline void reset_retry_counters(struct rxe_qp *qp)
+{
+       qp->comp.rnr_retry = qp->attr.rnr_retry;
+       qp->comp.retry_cnt = qp->attr.retry_cnt;
+}
+
+/* Compare the PSN of response @pkt with the oldest WQE and with the PSN
+ * the completer expects next, and choose the next state.
+ */
+static inline enum comp_state check_psn(struct rxe_qp *qp,
+                                       struct rxe_pkt_info *pkt,
+                                       struct rxe_send_wqe *wqe)
+{
+       s32 delta;
+
+       /* response lies beyond the oldest WQE: a pending send/write is
+        * implicitly complete, a pending read/atomic must be retried
+        */
+       delta = psn_compare(pkt->psn, wqe->last_psn);
+       if (delta > 0) {
+               if (wqe->state != wqe_state_pending)
+                       return COMPST_DONE;
+
+               if (wqe->mask & WR_ATOMIC_OR_READ_MASK)
+                       return COMPST_ERROR_RETRY;
+
+               reset_retry_counters(qp);
+               return COMPST_COMP_WQE;
+       }
+
+       /* now compare against the response we expect next */
+       delta = psn_compare(pkt->psn, qp->comp.psn);
+
+       /* older than expected, most likely a retried packet: complete the
+        * WQE if the packet matches its last PSN, otherwise ignore it
+        */
+       if (delta < 0) {
+               if (pkt->psn == wqe->last_psn)
+                       return COMPST_COMP_ACK;
+
+               return COMPST_DONE;
+       }
+
+       if ((delta > 0) && (wqe->mask & WR_ATOMIC_OR_READ_MASK))
+               return COMPST_ERROR_RETRY;
+
+       return COMPST_CHECK_ACK;
+}
+
+/* Validate response @pkt against the opcode sequence the completer is
+ * tracking (qp->comp.opcode) and against the current @wqe, then choose
+ * the next completer state.
+ *
+ * Successful read responses, atomic acks and plain acks reset the retry
+ * counters and return COMPST_READ, COMPST_ATOMIC or COMPST_WRITE_SEND.
+ * An RNR NAK returns COMPST_RNR_RETRY and a PSN sequence error NAK
+ * returns COMPST_ERROR_RETRY.  Other NAKs set wqe->status and return
+ * COMPST_ERROR, which is also returned for anything else unexpected.
+ */
+static inline enum comp_state check_ack(struct rxe_qp *qp,
+                                       struct rxe_pkt_info *pkt,
+                                       struct rxe_send_wqe *wqe)
+{
+       unsigned int mask = pkt->mask;
+       u8 syn;
+
+       /* Check the sequence only */
+       switch (qp->comp.opcode) {
+       case -1:
+               /* Will catch all *_ONLY cases. */
+               if (!(mask & RXE_START_MASK))
+                       return COMPST_ERROR;
+
+               break;
+
+       case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
+       case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
+               /* in the middle of a read response only MIDDLE or LAST
+                * may follow
+                */
+               if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE &&
+                   pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) {
+                       return COMPST_ERROR;
+               }
+               break;
+       default:
+               WARN_ON(1);
+       }
+
+       /* Check operation validity. */
+       switch (pkt->opcode) {
+       case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
+       case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST:
+       case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY:
+               syn = aeth_syn(pkt);
+
+               if ((syn & AETH_TYPE_MASK) != AETH_ACK)
+                       return COMPST_ERROR;
+
+               /* Fall through (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE
+                * doesn't have an AETH)
+                */
+       case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
+               /* a read response is only valid for a read WQE */
+               if (wqe->wr.opcode != IB_WR_RDMA_READ &&
+                   wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV) {
+                       return COMPST_ERROR;
+               }
+               reset_retry_counters(qp);
+               return COMPST_READ;
+
+       case IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE:
+               syn = aeth_syn(pkt);
+
+               if ((syn & AETH_TYPE_MASK) != AETH_ACK)
+                       return COMPST_ERROR;
+
+               /* an atomic ack is only valid for an atomic WQE */
+               if (wqe->wr.opcode != IB_WR_ATOMIC_CMP_AND_SWP &&
+                   wqe->wr.opcode != IB_WR_ATOMIC_FETCH_AND_ADD)
+                       return COMPST_ERROR;
+               reset_retry_counters(qp);
+               return COMPST_ATOMIC;
+
+       case IB_OPCODE_RC_ACKNOWLEDGE:
+               syn = aeth_syn(pkt);
+               switch (syn & AETH_TYPE_MASK) {
+               case AETH_ACK:
+                       reset_retry_counters(qp);
+                       return COMPST_WRITE_SEND;
+
+               case AETH_RNR_NAK:
+                       return COMPST_RNR_RETRY;
+
+               case AETH_NAK:
+                       switch (syn) {
+                       case AETH_NAK_PSN_SEQ_ERROR:
+                               /* a nak implicitly acks all packets with psns
+                                * before
+                                */
+                               if (psn_compare(pkt->psn, qp->comp.psn) > 0) {
+                                       qp->comp.psn = pkt->psn;
+                                       if (qp->req.wait_psn) {
+                                               qp->req.wait_psn = 0;
+                                               rxe_run_task(&qp->req.task, 1);
+                                       }
+                               }
+                               return COMPST_ERROR_RETRY;
+
+                       case AETH_NAK_INVALID_REQ:
+                               wqe->status = IB_WC_REM_INV_REQ_ERR;
+                               return COMPST_ERROR;
+
+                       case AETH_NAK_REM_ACC_ERR:
+                               wqe->status = IB_WC_REM_ACCESS_ERR;
+                               return COMPST_ERROR;
+
+                       case AETH_NAK_REM_OP_ERR:
+                               wqe->status = IB_WC_REM_OP_ERR;
+                               return COMPST_ERROR;
+
+                       default:
+                               pr_warn("unexpected nak %x\n", syn);
+                               wqe->status = IB_WC_REM_OP_ERR;
+                               return COMPST_ERROR;
+                       }
+
+               default:
+                       return COMPST_ERROR;
+               }
+               /* not reached: every case of the switch above returns */
+               break;
+
+       default:
+               pr_warn("unexpected opcode\n");
+       }
+
+       return COMPST_ERROR;
+}
+
+/* Copy the payload of a read response into the memory described by
+ * wqe->dma.  Returns COMPST_ERROR if the copy fails, COMPST_COMP_ACK if
+ * nothing is left to receive and the packet carries RXE_END_MASK, and
+ * COMPST_UPDATE_COMP otherwise.
+ */
+static inline enum comp_state do_read(struct rxe_qp *qp,
+                                     struct rxe_pkt_info *pkt,
+                                     struct rxe_send_wqe *wqe)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       int err;
+
+       err = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE,
+                       &wqe->dma, payload_addr(pkt),
+                       payload_size(pkt), to_mem_obj, NULL);
+       if (err)
+               return COMPST_ERROR;
+
+       /* more data outstanding, or not the final packet of the response */
+       if (wqe->dma.resid != 0 || !(pkt->mask & RXE_END_MASK))
+               return COMPST_UPDATE_COMP;
+
+       return COMPST_COMP_ACK;
+}
+
+/* Copy the 64-bit original value carried by an atomic ack into the
+ * memory described by wqe->dma.  Returns COMPST_COMP_ACK on success and
+ * COMPST_ERROR if the copy fails.
+ */
+static inline enum comp_state do_atomic(struct rxe_qp *qp,
+                                       struct rxe_pkt_info *pkt,
+                                       struct rxe_send_wqe *wqe)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       u64 orig_val = atmack_orig(pkt);
+       int err;
+
+       err = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE,
+                       &wqe->dma, &orig_val,
+                       sizeof(u64), to_mem_obj, NULL);
+
+       return err ? COMPST_ERROR : COMPST_COMP_ACK;
+}
+
+/* Build the completion queue entry for send WQE @wqe in @cqe.
+ *
+ * @cqe is zeroed first.  The ib_uverbs_wc layout is filled for QPs with
+ * is_user set, the ib_wc layout otherwise.  IB_WC_WITH_IMM is set only
+ * for the two *_WITH_IMM work request opcodes.
+ */
+static void make_send_cqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+                         struct rxe_cqe *cqe)
+{
+       int with_imm = (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
+                       wqe->wr.opcode == IB_WR_SEND_WITH_IMM);
+
+       memset(cqe, 0, sizeof(*cqe));
+
+       if (qp->is_user) {
+               struct ib_uverbs_wc *uwc = &cqe->uibwc;
+
+               uwc->wr_id = wqe->wr.wr_id;
+               uwc->status = wqe->status;
+               uwc->opcode = wr_to_wc_opcode(wqe->wr.opcode);
+               if (with_imm)
+                       uwc->wc_flags = IB_WC_WITH_IMM;
+               uwc->byte_len = wqe->dma.length;
+               uwc->qp_num = qp->ibqp.qp_num;
+               return;
+       }
+
+       {
+               struct ib_wc *wc = &cqe->ibwc;
+
+               wc->wr_id = wqe->wr.wr_id;
+               wc->status = wqe->status;
+               wc->opcode = wr_to_wc_opcode(wqe->wr.opcode);
+               if (with_imm)
+                       wc->wc_flags = IB_WC_WITH_IMM;
+               wc->byte_len = wqe->dma.length;
+               wc->qp = &qp->ibqp;
+       }
+}
+
+/* Retire @wqe from the send queue.
+ *
+ * A completion is posted to the send CQ when the QP signals all work
+ * requests, when the WQE asked for a signal, or when the requester is in
+ * the error state.  The queue consumer index is then advanced, and a
+ * requester waiting on a fence is rescheduled.
+ */
+static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+       int post_cqe = (qp->sq_sig_type == IB_SIGNAL_ALL_WR) ||
+                      (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+                      (qp->req.state == QP_STATE_ERROR);
+
+       if (post_cqe) {
+               struct rxe_cqe cqe;
+
+               make_send_cqe(qp, wqe, &cqe);
+               rxe_cq_post(qp->scq, &cqe, 0);
+       }
+
+       advance_consumer(qp->sq.queue);
+
+       if (!qp->req.wait_fence)
+               return;
+
+       /* something completed, so a fenced requester may proceed */
+       qp->req.wait_fence = 0;
+       rxe_run_task(&qp->req.task, 1);
+}
+
+/* Complete @wqe in response to @pkt.
+ *
+ * If the WQE holds a read/atomic credit, the credit is returned to
+ * qp->req.rd_atomic and a requester waiting for one is rescheduled.
+ * If the requester is draining and the completer has caught up with
+ * it, the QP moves to DRAINED and IB_EVENT_SQ_DRAINED is delivered to
+ * the QP's event handler, if one is set.  The WQE is then completed
+ * via do_complete().
+ *
+ * Returns COMPST_UPDATE_COMP when pkt->psn is at or past the expected
+ * completer PSN, COMPST_DONE otherwise.
+ */
+static inline enum comp_state complete_ack(struct rxe_qp *qp,
+                                          struct rxe_pkt_info *pkt,
+                                          struct rxe_send_wqe *wqe)
+{
+       unsigned long flags;
+
+       if (wqe->has_rd_atomic) {
+               wqe->has_rd_atomic = 0;
+               atomic_inc(&qp->req.rd_atomic);
+               if (qp->req.need_rd_atomic) {
+                       qp->comp.timeout_retry = 0;
+                       qp->req.need_rd_atomic = 0;
+                       rxe_run_task(&qp->req.task, 1);
+               }
+       }
+
+       /* unlocked peek first; the state is re-checked under the lock */
+       if (unlikely(qp->req.state == QP_STATE_DRAIN)) {
+               /* state_lock used by requester & completer */
+               spin_lock_irqsave(&qp->state_lock, flags);
+               if ((qp->req.state == QP_STATE_DRAIN) &&
+                   (qp->comp.psn == qp->req.psn)) {
+                       qp->req.state = QP_STATE_DRAINED;
+                       /* the lock is released before the handler runs */
+                       spin_unlock_irqrestore(&qp->state_lock, flags);
+
+                       if (qp->ibqp.event_handler) {
+                               struct ib_event ev;
+
+                               ev.device = qp->ibqp.device;
+                               ev.element.qp = &qp->ibqp;
+                               ev.event = IB_EVENT_SQ_DRAINED;
+                               qp->ibqp.event_handler(&ev,
+                                       qp->ibqp.qp_context);
+                       }
+               } else {
+                       spin_unlock_irqrestore(&qp->state_lock, flags);
+               }
+       }
+
+       do_complete(qp, wqe);
+
+       if (psn_compare(pkt->psn, qp->comp.psn) >= 0)
+               return COMPST_UPDATE_COMP;
+       else
+               return COMPST_DONE;
+}
+
+/* Complete @wqe without waiting for a matching ack of its own.
+ *
+ * The tracked opcode is reset to -1.  If a response packet is present,
+ * the expected PSN is moved past it when the packet is at or beyond that
+ * PSN, and a requester waiting on a PSN is rescheduled.  Always returns
+ * COMPST_GET_WQE so that the next WQE is examined.
+ */
+static inline enum comp_state complete_wqe(struct rxe_qp *qp,
+                                          struct rxe_pkt_info *pkt,
+                                          struct rxe_send_wqe *wqe)
+{
+       qp->comp.opcode = -1;
+
+       if (pkt && psn_compare(pkt->psn, qp->comp.psn) >= 0)
+               qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+
+       if (pkt && qp->req.wait_psn) {
+               qp->req.wait_psn = 0;
+               rxe_run_task(&qp->req.task, 1);
+       }
+
+       do_complete(qp, wqe);
+
+       return COMPST_GET_WQE;
+}
+
+/* free every queued response packet, dropping one QP reference for each */
+static void drain_resp_pkts(struct rxe_qp *qp)
+{
+       struct sk_buff *skb;
+
+       while ((skb = skb_dequeue(&qp->resp_pkts))) {
+               rxe_drop_ref(qp);
+               kfree_skb(skb);
+       }
+}
+
+/* throw away all queued send WQEs without generating completions */
+static void discard_send_wqes(struct rxe_qp *qp)
+{
+       while (queue_head(qp->sq.queue))
+               advance_consumer(qp->sq.queue);
+}
+
+/* complete all queued send WQEs with IB_WC_WR_FLUSH_ERR */
+static void flush_send_wqes(struct rxe_qp *qp)
+{
+       struct rxe_send_wqe *wqe;
+
+       while ((wqe = queue_head(qp->sq.queue))) {
+               wqe->status = IB_WC_WR_FLUSH_ERR;
+               do_complete(qp, wqe);
+       }
+}
+
+/* Completer task for a QP; @arg is the struct rxe_qp.
+ *
+ * Each call dequeues at most one response packet and runs the completer
+ * state machine against the WQE at the head of the send queue.
+ *
+ * Returns 0 when a packet has been processed and the task should call
+ * again to look for more work, or -EAGAIN when processing is finished
+ * and the task should leave its loop.
+ */
+int rxe_completer(void *arg)
+{
+       struct rxe_qp *qp = (struct rxe_qp *)arg;
+       /* assigned by get_wqe() before any state dereferences it */
+       struct rxe_send_wqe *wqe = NULL;
+       struct sk_buff *skb = NULL;
+       struct rxe_pkt_info *pkt = NULL;
+       enum comp_state state;
+
+       /* QP is going away: drop everything that is queued */
+       if (!qp->valid) {
+               drain_resp_pkts(qp);
+               discard_send_wqes(qp);
+               goto exit;
+       }
+
+       /* QP is in error: drop responses and flush the send queue */
+       if (qp->req.state == QP_STATE_ERROR) {
+               drain_resp_pkts(qp);
+               flush_send_wqes(qp);
+               goto exit;
+       }
+
+       /* QP was reset: drop everything that is queued */
+       if (qp->req.state == QP_STATE_RESET) {
+               drain_resp_pkts(qp);
+               discard_send_wqes(qp);
+               goto exit;
+       }
+
+       /* latch and clear the timeout flag set by retransmit_timer() */
+       if (qp->comp.timeout) {
+               qp->comp.timeout_retry = 1;
+               qp->comp.timeout = 0;
+       } else {
+               qp->comp.timeout_retry = 0;
+       }
+
+       if (qp->req.need_retry)
+               goto exit;
+
+       state = COMPST_GET_ACK;
+
+       while (1) {
+               pr_debug("state = %s\n", comp_state_name[state]);
+               switch (state) {
+               case COMPST_GET_ACK:
+                       skb = skb_dequeue(&qp->resp_pkts);
+                       if (skb) {
+                               pkt = SKB_TO_PKT(skb);
+                               qp->comp.timeout_retry = 0;
+                       }
+                       state = COMPST_GET_WQE;
+                       break;
+
+               case COMPST_GET_WQE:
+                       state = get_wqe(qp, pkt, &wqe);
+                       break;
+
+               case COMPST_CHECK_PSN:
+                       state = check_psn(qp, pkt, wqe);
+                       break;
+
+               case COMPST_CHECK_ACK:
+                       state = check_ack(qp, pkt, wqe);
+                       break;
+
+               case COMPST_READ:
+                       state = do_read(qp, pkt, wqe);
+                       break;
+
+               case COMPST_ATOMIC:
+                       state = do_atomic(qp, pkt, wqe);
+                       break;
+
+               case COMPST_WRITE_SEND:
+                       if (wqe->state == wqe_state_pending &&
+                           wqe->last_psn == pkt->psn)
+                               state = COMPST_COMP_ACK;
+                       else
+                               state = COMPST_UPDATE_COMP;
+                       break;
+
+               case COMPST_COMP_ACK:
+                       state = complete_ack(qp, pkt, wqe);
+                       break;
+
+               case COMPST_COMP_WQE:
+                       state = complete_wqe(qp, pkt, wqe);
+                       break;
+
+               case COMPST_UPDATE_COMP:
+                       if (pkt->mask & RXE_END_MASK)
+                               qp->comp.opcode = -1;
+                       else
+                               qp->comp.opcode = pkt->opcode;
+
+                       if (psn_compare(pkt->psn, qp->comp.psn) >= 0)
+                               qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+
+                       if (qp->req.wait_psn) {
+                               qp->req.wait_psn = 0;
+                               rxe_run_task(&qp->req.task, 1);
+                       }
+
+                       state = COMPST_DONE;
+                       break;
+
+               case COMPST_DONE:
+                       if (pkt) {
+                               rxe_drop_ref(pkt->qp);
+                               kfree_skb(skb);
+                       }
+                       goto done;
+
+               case COMPST_EXIT:
+                       if (qp->comp.timeout_retry && wqe) {
+                               state = COMPST_ERROR_RETRY;
+                               break;
+                       }
+
+                       /* reset the retransmit timer if
+                        * (1) QP is type RC
+                        * (2) the QP is alive
+                        * (3) there is a packet sent by the requester that
+                        *     might be acked (we still might get spurious
+                        *     timeouts but try to keep them as few as possible)
+                        * (4) the timeout parameter is set
+                        */
+                       if ((qp_type(qp) == IB_QPT_RC) &&
+                           (qp->req.state == QP_STATE_READY) &&
+                           (psn_compare(qp->req.psn, qp->comp.psn) > 0) &&
+                           qp->qp_timeout_jiffies)
+                               mod_timer(&qp->retrans_timer,
+                                         jiffies + qp->qp_timeout_jiffies);
+                       goto exit;
+
+               case COMPST_ERROR_RETRY:
+                       /* we come here if the retry timer fired and we did
+                        * not receive a response packet. try to retry the send
+                        * queue if that makes sense and the limits have not
+                        * been exceeded. remember that some timeouts are
+                        * spurious since we do not reset the timer but kick
+                        * it down the road or let it expire
+                        */
+
+                       /* there is nothing to retry in this case */
+                       if (!wqe || (wqe->state == wqe_state_posted))
+                               goto exit;
+
+                       if (qp->comp.retry_cnt > 0) {
+                               /* a count of 7 is never decremented */
+                               if (qp->comp.retry_cnt != 7)
+                                       qp->comp.retry_cnt--;
+
+                               /* no point in retrying if we have already
+                                * seen the last ack that the requester could
+                                * have caused
+                                */
+                               if (psn_compare(qp->req.psn,
+                                               qp->comp.psn) > 0) {
+                                       /* tell the requester to retry the
+                                        * send queue next time around
+                                        */
+                                       qp->req.need_retry = 1;
+                                       rxe_run_task(&qp->req.task, 1);
+                               }
+                               goto exit;
+                       } else {
+                               wqe->status = IB_WC_RETRY_EXC_ERR;
+                               state = COMPST_ERROR;
+                       }
+                       break;
+
+               case COMPST_RNR_RETRY:
+                       if (qp->comp.rnr_retry > 0) {
+                               /* a count of 7 is never decremented */
+                               if (qp->comp.rnr_retry != 7)
+                                       qp->comp.rnr_retry--;
+
+                               qp->req.need_retry = 1;
+                               pr_debug("set rnr nak timer\n");
+                               mod_timer(&qp->rnr_nak_timer,
+                                         jiffies + rnrnak_jiffies(aeth_syn(pkt)
+                                               & ~AETH_TYPE_MASK));
+                               goto exit;
+                       } else {
+                               wqe->status = IB_WC_RNR_RETRY_EXC_ERR;
+                               state = COMPST_ERROR;
+                       }
+                       break;
+
+               case COMPST_ERROR:
+                       do_complete(qp, wqe);
+                       rxe_qp_error(qp);
+                       goto exit;
+               }
+       }
+
+exit:
+       /* we come here if we are done with processing and want the task to
+        * exit from the loop calling us
+        */
+
+       /* a response packet dequeued above is still held when the error
+        * and retry states jump here: release it and its QP reference so
+        * they do not leak.  pkt is NULL on every other path to this label
+        */
+       if (pkt) {
+               rxe_drop_ref(pkt->qp);
+               kfree_skb(skb);
+       }
+       return -EAGAIN;
+
+done:
+       /* we come here if we have processed a packet we want the task to call
+        * us again to see if there is anything else to do
+        */
+       return 0;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c

new file mode 100644 (file)

index 0000000..e5e6a5e
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_cq.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+/* Validate a requested completion queue depth.
+ *
+ * @rxe:         device; its attr.max_cqe is the upper bound for @cqe
+ * @cq:          if non-NULL, @cqe must also be no smaller than the number
+ *               of elements currently held in cq->queue
+ * @cqe:         requested number of entries, must be positive
+ * @comp_vector: not examined here
+ * @udata:       not examined here
+ *
+ * Returns 0 when the request is acceptable and -EINVAL otherwise.
+ */
+int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
+                   int cqe, int comp_vector, struct ib_udata *udata)
+{
+       int count;
+
+       if (cqe <= 0) {
+               pr_warn("cqe(%d) <= 0\n", cqe);
+               return -EINVAL;
+       }
+
+       if (cqe > rxe->attr.max_cqe) {
+               pr_warn("cqe(%d) > max_cqe(%d)\n",
+                       cqe, rxe->attr.max_cqe);
+               return -EINVAL;
+       }
+
+       if (cq) {
+               count = queue_count(cq->queue);
+               if (cqe < count) {
+                       /* terminate the message like the other warnings do */
+                       pr_warn("cqe(%d) < current # elements in queue (%d)\n",
+                               cqe, count);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+/* Tasklet callback: @data is the rxe_cq pointer handed to tasklet_init().
+ * Calls the CQ's completion handler with its registered context.
+ */
+static void rxe_send_complete(unsigned long data)
+{
+       struct rxe_cq *cq = (struct rxe_cq *)data;
+       struct ib_cq *ibcq = &cq->ibcq;
+
+       ibcq->comp_handler(ibcq, ibcq->cq_context);
+}
+
+/* Create the queue backing @cq and finish initializing the CQ.
+ *
+ * rxe_queue_init() receives a pointer to @cqe and so may update it; the
+ * value left in @cqe afterwards is what gets stored in cq->ibcq.cqe.
+ * do_mmap_info() is then called for the queue buffer, and on success the
+ * completion tasklet and the CQ lock are initialized.
+ *
+ * Returns 0 on success, -ENOMEM if the queue could not be created, or the
+ * error returned by do_mmap_info().
+ */
+int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
+                    int comp_vector, struct ib_ucontext *context,
+                    struct ib_udata *udata)
+{
+       int err;
+
+       cq->queue = rxe_queue_init(rxe, &cqe,
+                                  sizeof(struct rxe_cqe));
+       if (!cq->queue) {
+               pr_warn("unable to create cq\n");
+               return -ENOMEM;
+       }
+
+       err = do_mmap_info(rxe, udata, false, context, cq->queue->buf,
+                          cq->queue->buf_size, &cq->queue->ip);
+       if (err) {
+               kvfree(cq->queue->buf);
+               kfree(cq->queue);
+               /* rxe_cq_cleanup() tests cq->queue; do not leave it
+                * pointing at freed memory
+                */
+               cq->queue = NULL;
+               return err;
+       }
+
+       if (udata)
+               cq->is_user = 1;
+
+       tasklet_init(&cq->comp_task, rxe_send_complete, (unsigned long)cq);
+
+       spin_lock_init(&cq->cq_lock);
+       cq->ibcq.cqe = cqe;
+       return 0;
+}
+
+/* Resize the queue behind @cq to @cqe entries via rxe_queue_resize().
+ * On success cq->ibcq.cqe is updated to the (possibly adjusted) @cqe.
+ * Returns 0 or the error from rxe_queue_resize().
+ */
+int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe, struct ib_udata *udata)
+{
+       int ret = rxe_queue_resize(cq->queue, (unsigned int *)&cqe,
+                                  sizeof(struct rxe_cqe),
+                                  cq->queue->ip ? cq->queue->ip->context : NULL,
+                                  udata, NULL, &cq->cq_lock);
+
+       if (ret)
+               return ret;
+
+       cq->ibcq.cqe = cqe;
+       return 0;
+}
+
+/* Append the completion @cqe to @cq.
+ *
+ * If the queue is full, an IB_EVENT_CQ_ERR event is delivered to the CQ's
+ * event handler (when one is set) and -EBUSY is returned. Otherwise the
+ * entry is copied in under cq_lock, and the completion tasklet is scheduled
+ * when the CQ is armed for the next completion, or armed for solicited
+ * completions and @solicited is set. Returns 0 in that case.
+ */
+int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
+{
+       struct ib_event ev;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&cq->cq_lock, irq_flags);
+
+       if (unlikely(queue_full(cq->queue))) {
+               spin_unlock_irqrestore(&cq->cq_lock, irq_flags);
+
+               /* report the overflow outside of the lock */
+               if (cq->ibcq.event_handler) {
+                       ev.device = cq->ibcq.device;
+                       ev.element.cq = &cq->ibcq;
+                       ev.event = IB_EVENT_CQ_ERR;
+                       cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+               }
+
+               return -EBUSY;
+       }
+
+       memcpy(producer_addr(cq->queue), cqe, sizeof(*cqe));
+
+       /* the entry contents must be written before the producer
+        * pointer is moved past them
+        */
+       smp_wmb();
+
+       advance_producer(cq->queue);
+       spin_unlock_irqrestore(&cq->cq_lock, irq_flags);
+
+       if (cq->notify == IB_CQ_NEXT_COMP ||
+           (cq->notify == IB_CQ_SOLICITED && solicited)) {
+               /* disarm, then let the tasklet call the handler */
+               cq->notify = 0;
+               tasklet_schedule(&cq->comp_task);
+       }
+
+       return 0;
+}
+
+/* Release the queue of the CQ passed in @arg, if it has one. */
+void rxe_cq_cleanup(void *arg)
+{
+       struct rxe_cq *cq = arg;
+
+       if (!cq->queue)
+               return;
+
+       rxe_queue_cleanup(cq->queue);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_dma.c b/drivers/infiniband/sw/rxe/rxe_dma.c

new file mode 100644 (file)

index 0000000..7634c1a
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_dma.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+#define DMA_BAD_ADDER ((u64)0)
+
+/* Returns 1 if @dma_addr is the DMA_BAD_ADDER sentinel, 0 otherwise. */
+static int rxe_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+       return (dma_addr == DMA_BAD_ADDER) ? 1 : 0;
+}
+
+/* "Map" a buffer: the returned handle is simply @cpu_addr as an integer.
+ * Warns if @direction is not a valid DMA direction.
+ */
+static u64 rxe_dma_map_single(struct ib_device *dev,
+                             void *cpu_addr, size_t size,
+                             enum dma_data_direction direction)
+{
+       u64 handle = (uintptr_t)cpu_addr;
+
+       WARN_ON(!valid_dma_direction(direction));
+
+       return handle;
+}
+
+/* Counterpart of rxe_dma_map_single(): there is nothing to undo, so this
+ * only warns when @direction is not a valid DMA direction.
+ */
+static void rxe_dma_unmap_single(struct ib_device *dev,
+                                u64 addr, size_t size,
+                                enum dma_data_direction direction)
+{
+       WARN_ON(!valid_dma_direction(direction));
+}
+
+/* Produce a handle for @size bytes at @offset inside @page.
+ *
+ * The range must not extend past the end of the page; if it does,
+ * DMA_BAD_ADDER is returned. Otherwise the result is page_address(@page)
+ * plus @offset, or 0 when page_address() yields no address.
+ */
+static u64 rxe_dma_map_page(struct ib_device *dev,
+                           struct page *page,
+                           unsigned long offset,
+                           size_t size, enum dma_data_direction direction)
+{
+       u64 base;
+
+       WARN_ON(!valid_dma_direction(direction));
+
+       if (offset + size > PAGE_SIZE)
+               return DMA_BAD_ADDER;
+
+       base = (uintptr_t)page_address(page);
+       if (!base)
+               return base;
+
+       return base + offset;
+}
+
+/* Counterpart of rxe_dma_map_page(): nothing is released here, the
+ * function only warns when @direction is not a valid DMA direction.
+ */
+static void rxe_dma_unmap_page(struct ib_device *dev,
+                              u64 addr, size_t size,
+                              enum dma_data_direction direction)
+{
+       WARN_ON(!valid_dma_direction(direction));
+}
+
+static int rxe_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+                     int nents, enum dma_data_direction direction)
+{
+       struct scatterlist *sg;
+       u64 addr;
+       int i;
+       int ret = nents;
+
+       WARN_ON(!valid_dma_direction(direction));
+
+       for_each_sg(sgl, sg, nents, i) {
+               addr = (uintptr_t)page_address(sg_page(sg));
+               if (!addr) {
+                       ret = 0;
+                       break;
+               }
+               sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+               sg->dma_length = sg->length;
+#endif
+       }
+
+       return ret;
+}
+
+/* Counterpart of rxe_map_sg(): the scatterlist is left untouched, the
+ * function only warns when @direction is not a valid DMA direction.
+ */
+static void rxe_unmap_sg(struct ib_device *dev,
+                        struct scatterlist *sg, int nents,
+                        enum dma_data_direction direction)
+{
+       WARN_ON(!valid_dma_direction(direction));
+}
+
+/* Intentionally empty: this implementation performs no work to sync a
+ * mapping for CPU access.
+ */
+static void rxe_sync_single_for_cpu(struct ib_device *dev,
+                                   u64 addr,
+                                   size_t size, enum dma_data_direction dir)
+{
+}
+
+/* Intentionally empty: this implementation performs no work to sync a
+ * mapping for device access.
+ */
+static void rxe_sync_single_for_device(struct ib_device *dev,
+                                      u64 addr,
+                                      size_t size, enum dma_data_direction dir)
+{
+}
+
+/* Allocate enough whole pages to hold @size bytes.
+ *
+ * Returns the kernel address of the pages, or NULL if the allocation
+ * failed. When @dma_handle is non-NULL it receives the same address as an
+ * integer (0 on failure).
+ */
+static void *rxe_dma_alloc_coherent(struct ib_device *dev, size_t size,
+                                   u64 *dma_handle, gfp_t flag)
+{
+       struct page *pg = alloc_pages(flag, get_order(size));
+       void *vaddr = pg ? page_address(pg) : NULL;
+
+       if (dma_handle)
+               *dma_handle = (uintptr_t)vaddr;
+
+       return vaddr;
+}
+
+/* Free pages obtained from rxe_dma_alloc_coherent(); @size must match the
+ * size used for the allocation so that the same page order is computed.
+ */
+static void rxe_dma_free_coherent(struct ib_device *dev, size_t size,
+                                 void *cpu_addr, u64 dma_handle)
+{
+       unsigned long vaddr = (unsigned long)cpu_addr;
+
+       free_pages(vaddr, get_order(size));
+}
+
+/* DMA mapping operations table built from the helpers in this file */
+struct ib_dma_mapping_ops rxe_dma_mapping_ops = {
+       .mapping_error          = rxe_mapping_error,
+
+       .map_single             = rxe_dma_map_single,
+       .unmap_single           = rxe_dma_unmap_single,
+
+       .map_page               = rxe_dma_map_page,
+       .unmap_page             = rxe_dma_unmap_page,
+
+       .map_sg                 = rxe_map_sg,
+       .unmap_sg               = rxe_unmap_sg,
+
+       .sync_single_for_cpu    = rxe_sync_single_for_cpu,
+       .sync_single_for_device = rxe_sync_single_for_device,
+
+       .alloc_coherent         = rxe_dma_alloc_coherent,
+       .free_coherent          = rxe_dma_free_coherent,
+};
diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h

new file mode 100644 (file)

index 0000000..d57b5e9
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_hdr.h
@@ -0,0 +1,952 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_HDR_H
+#define RXE_HDR_H
+
+/* Information extracted from a packet carried in an sk_buff. For received
+ * packets it is stored in the control block (cb array) of the sk_buff, so
+ * it must be at most 48 bytes.
+ */
+struct rxe_pkt_info {
+       struct rxe_dev          *rxe;           /* device that owns packet */
+       struct rxe_qp           *qp;            /* qp that owns packet */
+       struct rxe_send_wqe     *wqe;           /* send wqe */
+       u8                      *hdr;           /* points to bth */
+       u32                     mask;           /* useful info about pkt */
+       u32                     psn;            /* bth psn of packet */
+       u16                     pkey_index;     /* partition of pkt */
+       u16                     paylen;         /* length of bth - icrc */
+       u8                      port_num;       /* port pkt received on */
+       u8                      opcode;         /* bth opcode of packet */
+       /* the header accessors below locate the bth at hdr + offset */
+       u8                      offset;         /* bth offset from pkt->hdr */
+};
+
+/* Macros should be used only for received skb */
+#define SKB_TO_PKT(skb) ((struct rxe_pkt_info *)(skb)->cb)
+#define PKT_TO_SKB(pkt) container_of((void *)(pkt), struct sk_buff, cb)
+
+/*
+ * IBA header types and methods
+ *
+ * Some of these are for reference and completeness only since
+ * rxe does not currently support RD transport
+ * most of this could be moved into IB core. ib_pack.h has
+ * part of this but is incomplete
+ *
+ * Header specific routines to insert/extract values to/from headers
+ * the routines that are named __hhh_(set_)fff() take a pointer to a
+ * hhh header and get(set) the fff field. The routines named
+ * hhh_(set_)fff take a packet info struct and find the
+ * header and field based on the opcode in the packet.
+ * Conversion to/from network byte order from cpu order is also done.
+ */
+
+#define RXE_ICRC_SIZE          (4)
+#define RXE_MAX_HDR_LENGTH     (80)
+
+/******************************************************************************
+ * Base Transport Header
+ ******************************************************************************/
+struct rxe_bth {
+       u8                      opcode;
+       /* SE, MIG, PAD and TVER bit fields, see BTH_*_MASK */
+       u8                      flags;
+       __be16                  pkey;
+       /* FECN, BECN and resv6a in the top byte, QPN in the low 24 bits */
+       __be32                  qpn;
+       /* ACK bit and resv7 in the top byte, PSN in the low 24 bits */
+       __be32                  apsn;
+};
+
+#define BTH_TVER               (0)
+#define BTH_DEF_PKEY           (0xffff)
+
+#define BTH_SE_MASK            (0x80)
+#define BTH_MIG_MASK           (0x40)
+#define BTH_PAD_MASK           (0x30)
+#define BTH_TVER_MASK          (0x0f)
+#define BTH_FECN_MASK          (0x80000000)
+#define BTH_BECN_MASK          (0x40000000)
+#define BTH_RESV6A_MASK                (0x3f000000)
+#define BTH_QPN_MASK           (0x00ffffff)
+#define BTH_ACK_MASK           (0x80000000)
+#define BTH_RESV7_MASK         (0x7f000000)
+#define BTH_PSN_MASK           (0x00ffffff)
+
+/* read the opcode byte of the BTH at @arg */
+static inline u8 __bth_opcode(void *arg)
+{
+       const struct rxe_bth *hdr = arg;
+       u8 opcode = hdr->opcode;
+
+       return opcode;
+}
+
+/* store @opcode in the BTH at @arg */
+static inline void __bth_set_opcode(void *arg, u8 opcode)
+{
+       struct rxe_bth *hdr = arg;
+
+       hdr->opcode = opcode;
+}
+
+/* returns 1 if the SE flag is set in the BTH at @arg, else 0 */
+static inline u8 __bth_se(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+
+       return (hdr->flags & BTH_SE_MASK) != 0;
+}
+
+/* set the SE flag when @se is non-zero, clear it otherwise */
+static inline void __bth_set_se(void *arg, int se)
+{
+       struct rxe_bth *hdr = arg;
+       u8 flags = hdr->flags & ~BTH_SE_MASK;
+
+       if (se)
+               flags |= BTH_SE_MASK;
+
+       hdr->flags = flags;
+}
+
+/* returns 1 if the MIG flag is set in the BTH at @arg, else 0 */
+static inline u8 __bth_mig(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+
+       return (hdr->flags & BTH_MIG_MASK) != 0;
+}
+
+/* set the MIG flag when @mig is non-zero, clear it otherwise */
+static inline void __bth_set_mig(void *arg, u8 mig)
+{
+       struct rxe_bth *hdr = arg;
+       u8 flags = hdr->flags & ~BTH_MIG_MASK;
+
+       if (mig)
+               flags |= BTH_MIG_MASK;
+
+       hdr->flags = flags;
+}
+
+/* extract the 2-bit pad count (bits 5:4 of flags) */
+static inline u8 __bth_pad(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+       u8 pad_bits = hdr->flags & BTH_PAD_MASK;
+
+       return pad_bits >> 4;
+}
+
+/* replace the pad count field, leaving the other flag bits alone */
+static inline void __bth_set_pad(void *arg, u8 pad)
+{
+       struct rxe_bth *hdr = arg;
+       u8 others = hdr->flags & ~BTH_PAD_MASK;
+       u8 field = (pad << 4) & BTH_PAD_MASK;
+
+       hdr->flags = field | others;
+}
+
+/* extract the transport version (low 4 bits of flags) */
+static inline u8 __bth_tver(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+
+       return hdr->flags & BTH_TVER_MASK;
+}
+
+/* replace the transport version field, leaving the other flag bits alone */
+static inline void __bth_set_tver(void *arg, u8 tver)
+{
+       struct rxe_bth *hdr = arg;
+       u8 others = hdr->flags & ~BTH_TVER_MASK;
+       u8 field = tver & BTH_TVER_MASK;
+
+       hdr->flags = field | others;
+}
+
+/* read the pkey in cpu byte order */
+static inline u16 __bth_pkey(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+       u16 pkey = be16_to_cpu(hdr->pkey);
+
+       return pkey;
+}
+
+/* store @pkey in network byte order */
+static inline void __bth_set_pkey(void *arg, u16 pkey)
+{
+       struct rxe_bth *hdr = arg;
+       __be16 wire = cpu_to_be16(pkey);
+
+       hdr->pkey = wire;
+}
+
+/* read the 24-bit QPN in cpu byte order */
+static inline u32 __bth_qpn(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+       u32 word = be32_to_cpu(hdr->qpn);
+
+       return word & BTH_QPN_MASK;
+}
+
+/* replace the 24-bit QPN, preserving the top byte of the word */
+static inline void __bth_set_qpn(void *arg, u32 qpn)
+{
+       struct rxe_bth *hdr = arg;
+       u32 word = be32_to_cpu(hdr->qpn);
+
+       word &= ~BTH_QPN_MASK;
+       word |= qpn & BTH_QPN_MASK;
+       hdr->qpn = cpu_to_be32(word);
+}
+
+/* returns 1 if the FECN bit is set, else 0 */
+static inline int __bth_fecn(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+       __be32 bit = cpu_to_be32(BTH_FECN_MASK);
+
+       return (hdr->qpn & bit) != 0;
+}
+
+/* set the FECN bit when @fecn is non-zero, clear it otherwise */
+static inline void __bth_set_fecn(void *arg, int fecn)
+{
+       struct rxe_bth *hdr = arg;
+       __be32 bit = cpu_to_be32(BTH_FECN_MASK);
+
+       if (!fecn)
+               hdr->qpn &= ~bit;
+       else
+               hdr->qpn |= bit;
+}
+
+/* returns 1 if the BECN bit is set, else 0 */
+static inline int __bth_becn(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+       __be32 bit = cpu_to_be32(BTH_BECN_MASK);
+
+       return (hdr->qpn & bit) != 0;
+}
+
+/* set the BECN bit when @becn is non-zero, clear it otherwise */
+static inline void __bth_set_becn(void *arg, int becn)
+{
+       struct rxe_bth *hdr = arg;
+       __be32 bit = cpu_to_be32(BTH_BECN_MASK);
+
+       if (!becn)
+               hdr->qpn &= ~bit;
+       else
+               hdr->qpn |= bit;
+}
+
+/* read the 6 reserved bits that sit between BECN and the QPN */
+static inline u8 __bth_resv6a(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+       u32 word = be32_to_cpu(hdr->qpn);
+
+       return (word & BTH_RESV6A_MASK) >> 24;
+}
+
+/* Clear the 6 reserved bits between BECN and the QPN.
+ *
+ * Only the reserved bits are touched: FECN, BECN and the QPN sharing the
+ * same 32-bit word are preserved, mirroring __bth_set_resv7().
+ */
+static inline void __bth_set_resv6a(void *arg)
+{
+       struct rxe_bth *bth = arg;
+
+       bth->qpn &= ~cpu_to_be32(BTH_RESV6A_MASK);
+}
+
+/* returns 1 if the ACK request bit is set, else 0 */
+static inline int __bth_ack(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+       __be32 bit = cpu_to_be32(BTH_ACK_MASK);
+
+       return (hdr->apsn & bit) != 0;
+}
+
+/* set the ACK request bit when @ack is non-zero, clear it otherwise */
+static inline void __bth_set_ack(void *arg, int ack)
+{
+       struct rxe_bth *hdr = arg;
+       __be32 bit = cpu_to_be32(BTH_ACK_MASK);
+
+       if (!ack)
+               hdr->apsn &= ~bit;
+       else
+               hdr->apsn |= bit;
+}
+
+/* clear the 7 reserved bits between the ACK bit and the PSN */
+static inline void __bth_set_resv7(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+       __be32 resv = cpu_to_be32(BTH_RESV7_MASK);
+
+       hdr->apsn &= ~resv;
+}
+
+/* read the 24-bit PSN in cpu byte order */
+static inline u32 __bth_psn(void *arg)
+{
+       struct rxe_bth *hdr = arg;
+       u32 word = be32_to_cpu(hdr->apsn);
+
+       return word & BTH_PSN_MASK;
+}
+
+/* replace the 24-bit PSN, preserving the top byte of the word */
+static inline void __bth_set_psn(void *arg, u32 psn)
+{
+       struct rxe_bth *hdr = arg;
+       u32 word = be32_to_cpu(hdr->apsn);
+
+       word &= ~BTH_PSN_MASK;
+       word |= psn & BTH_PSN_MASK;
+       hdr->apsn = cpu_to_be32(word);
+}
+
+/* BTH opcode of @pkt */
+static inline u8 bth_opcode(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_opcode(bth);
+}
+
+/* set the BTH opcode of @pkt */
+static inline void bth_set_opcode(struct rxe_pkt_info *pkt, u8 opcode)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_opcode(bth, opcode);
+}
+
+/* SE flag of @pkt's BTH */
+static inline u8 bth_se(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_se(bth);
+}
+
+/* set or clear the SE flag of @pkt's BTH */
+static inline void bth_set_se(struct rxe_pkt_info *pkt, int se)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_se(bth, se);
+}
+
+/* MIG flag of @pkt's BTH */
+static inline u8 bth_mig(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_mig(bth);
+}
+
+/* set or clear the MIG flag of @pkt's BTH */
+static inline void bth_set_mig(struct rxe_pkt_info *pkt, u8 mig)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_mig(bth, mig);
+}
+
+/* pad count of @pkt's BTH */
+static inline u8 bth_pad(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_pad(bth);
+}
+
+/* set the pad count of @pkt's BTH */
+static inline void bth_set_pad(struct rxe_pkt_info *pkt, u8 pad)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_pad(bth, pad);
+}
+
+/* transport version of @pkt's BTH */
+static inline u8 bth_tver(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_tver(bth);
+}
+
+/* set the transport version of @pkt's BTH */
+static inline void bth_set_tver(struct rxe_pkt_info *pkt, u8 tver)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_tver(bth, tver);
+}
+
+/* pkey of @pkt's BTH, cpu byte order */
+static inline u16 bth_pkey(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_pkey(bth);
+}
+
+/* set the pkey of @pkt's BTH */
+static inline void bth_set_pkey(struct rxe_pkt_info *pkt, u16 pkey)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_pkey(bth, pkey);
+}
+
+/* QPN of @pkt's BTH, cpu byte order */
+static inline u32 bth_qpn(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_qpn(bth);
+}
+
+/* set the QPN of @pkt's BTH */
+static inline void bth_set_qpn(struct rxe_pkt_info *pkt, u32 qpn)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_qpn(bth, qpn);
+}
+
+/* FECN bit of @pkt's BTH */
+static inline int bth_fecn(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_fecn(bth);
+}
+
+/* set or clear the FECN bit of @pkt's BTH */
+static inline void bth_set_fecn(struct rxe_pkt_info *pkt, int fecn)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_fecn(bth, fecn);
+}
+
+/* BECN bit of @pkt's BTH */
+static inline int bth_becn(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_becn(bth);
+}
+
+/* set or clear the BECN bit of @pkt's BTH */
+static inline void bth_set_becn(struct rxe_pkt_info *pkt, int becn)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_becn(bth, becn);
+}
+
+/* reserved 6-bit field of @pkt's BTH */
+static inline u8 bth_resv6a(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_resv6a(bth);
+}
+
+/* apply __bth_set_resv6a() to @pkt's BTH */
+static inline void bth_set_resv6a(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_resv6a(bth);
+}
+
+/* ACK request bit of @pkt's BTH */
+static inline int bth_ack(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_ack(bth);
+}
+
+/* set or clear the ACK request bit of @pkt's BTH */
+static inline void bth_set_ack(struct rxe_pkt_info *pkt, int ack)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_ack(bth, ack);
+}
+
+/* clear the reserved 7-bit field of @pkt's BTH */
+static inline void bth_set_resv7(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_resv7(bth);
+}
+
+/* PSN of @pkt's BTH, cpu byte order */
+static inline u32 bth_psn(struct rxe_pkt_info *pkt)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       return __bth_psn(bth);
+}
+
+/* set the PSN of @pkt's BTH */
+static inline void bth_set_psn(struct rxe_pkt_info *pkt, u32 psn)
+{
+       void *bth = pkt->hdr + pkt->offset;
+
+       __bth_set_psn(bth, psn);
+}
+
+/* Write every field of @pkt's BTH in one go.
+ *
+ * flags is built from @pad plus the SE/MIG bits (TVER is left 0), the qpn
+ * word holds only the 24-bit @qpn, and the apsn word holds the 24-bit @psn
+ * plus the ACK request bit when @ack_req is non-zero. All reserved bits
+ * end up cleared.
+ */
+static inline void bth_init(struct rxe_pkt_info *pkt, u8 opcode, int se,
+                           int mig, int pad, u16 pkey, u32 qpn, int ack_req,
+                           u32 psn)
+{
+       struct rxe_bth *hdr = (struct rxe_bth *)(pkt->hdr + pkt->offset);
+       u8 flags = (pad << 4) & BTH_PAD_MASK;
+       u32 apsn = psn & BTH_PSN_MASK;
+
+       if (se)
+               flags |= BTH_SE_MASK;
+       if (mig)
+               flags |= BTH_MIG_MASK;
+       if (ack_req)
+               apsn |= BTH_ACK_MASK;
+
+       hdr->opcode = opcode;
+       hdr->flags = flags;
+       hdr->pkey = cpu_to_be16(pkey);
+       hdr->qpn = cpu_to_be32(qpn & BTH_QPN_MASK);
+       hdr->apsn = cpu_to_be32(apsn);
+}
+
+/******************************************************************************
+ * Reliable Datagram Extended Transport Header
+ ******************************************************************************/
+struct rxe_rdeth {
+       /* low 24 bits carry the EEN, see RDETH_EEN_MASK */
+       __be32                  een;
+};
+
+#define RDETH_EEN_MASK         (0x00ffffff)
+
+/* Read the 24-bit EEN from the RDETH at @arg in cpu byte order.
+ *
+ * The value is returned as u32 so that none of the 24 bits selected by
+ * RDETH_EEN_MASK are lost.
+ */
+static inline u32 __rdeth_een(void *arg)
+{
+       struct rxe_rdeth *rdeth = arg;
+
+       return RDETH_EEN_MASK & be32_to_cpu(rdeth->een);
+}
+
+/* Store the low 24 bits of @een in the RDETH at @arg; the top byte of the
+ * word is written as zero.
+ */
+static inline void __rdeth_set_een(void *arg, u32 een)
+{
+       struct rxe_rdeth *rdeth = arg;
+
+       rdeth->een = cpu_to_be32(RDETH_EEN_MASK & een);
+}
+
+/* Read the 24-bit EEN of @pkt; the RDETH is located through the opcode
+ * table entry for the packet's opcode.
+ */
+static inline u32 rdeth_een(struct rxe_pkt_info *pkt)
+{
+       return __rdeth_een(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_RDETH]);
+}
+
+/* set the EEN in @pkt's RDETH */
+static inline void rdeth_set_een(struct rxe_pkt_info *pkt, u32 een)
+{
+       void *rdeth = pkt->hdr + pkt->offset +
+                     rxe_opcode[pkt->opcode].offset[RXE_RDETH];
+
+       __rdeth_set_een(rdeth, een);
+}
+
+/******************************************************************************
+ * Datagram Extended Transport Header
+ ******************************************************************************/
+struct rxe_deth {
+       __be32                  qkey;
+       /* low 24 bits carry the source QP, see DETH_SQP_MASK */
+       __be32                  sqp;
+};
+
+#define GSI_QKEY               (0x80010000)
+#define DETH_SQP_MASK          (0x00ffffff)
+
+/* read the qkey in cpu byte order */
+static inline u32 __deth_qkey(void *arg)
+{
+       struct rxe_deth *hdr = arg;
+       u32 qkey = be32_to_cpu(hdr->qkey);
+
+       return qkey;
+}
+
+/* store @qkey in network byte order */
+static inline void __deth_set_qkey(void *arg, u32 qkey)
+{
+       struct rxe_deth *hdr = arg;
+       __be32 wire = cpu_to_be32(qkey);
+
+       hdr->qkey = wire;
+}
+
+/* read the 24-bit source QP in cpu byte order */
+static inline u32 __deth_sqp(void *arg)
+{
+       struct rxe_deth *hdr = arg;
+       u32 word = be32_to_cpu(hdr->sqp);
+
+       return word & DETH_SQP_MASK;
+}
+
+/* store the low 24 bits of @sqp; the top byte is written as zero */
+static inline void __deth_set_sqp(void *arg, u32 sqp)
+{
+       struct rxe_deth *hdr = arg;
+       u32 word = sqp & DETH_SQP_MASK;
+
+       hdr->sqp = cpu_to_be32(word);
+}
+
+/* qkey from @pkt's DETH */
+static inline u32 deth_qkey(struct rxe_pkt_info *pkt)
+{
+       void *deth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_DETH];
+
+       return __deth_qkey(deth);
+}
+
+/* set the qkey in @pkt's DETH */
+static inline void deth_set_qkey(struct rxe_pkt_info *pkt, u32 qkey)
+{
+       void *deth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_DETH];
+
+       __deth_set_qkey(deth, qkey);
+}
+
+/* source QP from @pkt's DETH */
+static inline u32 deth_sqp(struct rxe_pkt_info *pkt)
+{
+       void *deth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_DETH];
+
+       return __deth_sqp(deth);
+}
+
+/* set the source QP in @pkt's DETH */
+static inline void deth_set_sqp(struct rxe_pkt_info *pkt, u32 sqp)
+{
+       void *deth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_DETH];
+
+       __deth_set_sqp(deth, sqp);
+}
+
+/******************************************************************************
+ * RDMA Extended Transport Header
+ ******************************************************************************/
+/* all fields are stored big-endian; use the reth_*() accessors */
+struct rxe_reth {
+       __be64                  va;
+       __be32                  rkey;
+       __be32                  len;
+};
+
+/* read va in cpu byte order */
+static inline u64 __reth_va(void *arg)
+{
+       struct rxe_reth *hdr = arg;
+       u64 va = be64_to_cpu(hdr->va);
+
+       return va;
+}
+
+/* store @va in network byte order */
+static inline void __reth_set_va(void *arg, u64 va)
+{
+       struct rxe_reth *hdr = arg;
+       __be64 wire = cpu_to_be64(va);
+
+       hdr->va = wire;
+}
+
+/* read rkey in cpu byte order */
+static inline u32 __reth_rkey(void *arg)
+{
+       struct rxe_reth *hdr = arg;
+       u32 rkey = be32_to_cpu(hdr->rkey);
+
+       return rkey;
+}
+
+/* store @rkey in network byte order */
+static inline void __reth_set_rkey(void *arg, u32 rkey)
+{
+       struct rxe_reth *hdr = arg;
+       __be32 wire = cpu_to_be32(rkey);
+
+       hdr->rkey = wire;
+}
+
+/* read len in cpu byte order */
+static inline u32 __reth_len(void *arg)
+{
+       struct rxe_reth *hdr = arg;
+       u32 len = be32_to_cpu(hdr->len);
+
+       return len;
+}
+
+/* store @len in network byte order */
+static inline void __reth_set_len(void *arg, u32 len)
+{
+       struct rxe_reth *hdr = arg;
+       __be32 wire = cpu_to_be32(len);
+
+       hdr->len = wire;
+}
+
+/* va from @pkt's RETH */
+static inline u64 reth_va(struct rxe_pkt_info *pkt)
+{
+       void *reth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_RETH];
+
+       return __reth_va(reth);
+}
+
+/* set va in @pkt's RETH */
+static inline void reth_set_va(struct rxe_pkt_info *pkt, u64 va)
+{
+       void *reth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_RETH];
+
+       __reth_set_va(reth, va);
+}
+
+/* rkey from @pkt's RETH */
+static inline u32 reth_rkey(struct rxe_pkt_info *pkt)
+{
+       void *reth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_RETH];
+
+       return __reth_rkey(reth);
+}
+
+/* set rkey in @pkt's RETH */
+static inline void reth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey)
+{
+       void *reth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_RETH];
+
+       __reth_set_rkey(reth, rkey);
+}
+
+/* len from @pkt's RETH */
+static inline u32 reth_len(struct rxe_pkt_info *pkt)
+{
+       void *reth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_RETH];
+
+       return __reth_len(reth);
+}
+
+/* set len in @pkt's RETH */
+static inline void reth_set_len(struct rxe_pkt_info *pkt, u32 len)
+{
+       void *reth = pkt->hdr + pkt->offset +
+                    rxe_opcode[pkt->opcode].offset[RXE_RETH];
+
+       __reth_set_len(reth, len);
+}
+
+/******************************************************************************
+ * Atomic Extended Transport Header
+ ******************************************************************************/
+/* packed: the 4-byte rkey is followed directly by 8-byte fields, so no
+ * alignment padding may be inserted between them
+ */
+struct rxe_atmeth {
+       __be64                  va;
+       __be32                  rkey;
+       __be64                  swap_add;
+       __be64                  comp;
+} __attribute__((__packed__));
+
+/* read va in cpu byte order */
+static inline u64 __atmeth_va(void *arg)
+{
+       struct rxe_atmeth *hdr = arg;
+       u64 va = be64_to_cpu(hdr->va);
+
+       return va;
+}
+
+/* store @va in network byte order */
+static inline void __atmeth_set_va(void *arg, u64 va)
+{
+       struct rxe_atmeth *hdr = arg;
+       __be64 wire = cpu_to_be64(va);
+
+       hdr->va = wire;
+}
+
+/* read rkey in cpu byte order */
+static inline u32 __atmeth_rkey(void *arg)
+{
+       struct rxe_atmeth *hdr = arg;
+       u32 rkey = be32_to_cpu(hdr->rkey);
+
+       return rkey;
+}
+
+/* store @rkey in network byte order */
+static inline void __atmeth_set_rkey(void *arg, u32 rkey)
+{
+       struct rxe_atmeth *hdr = arg;
+       __be32 wire = cpu_to_be32(rkey);
+
+       hdr->rkey = wire;
+}
+
+/* read swap_add in cpu byte order */
+static inline u64 __atmeth_swap_add(void *arg)
+{
+       struct rxe_atmeth *hdr = arg;
+       u64 operand = be64_to_cpu(hdr->swap_add);
+
+       return operand;
+}
+
+/* store @swap_add in network byte order */
+static inline void __atmeth_set_swap_add(void *arg, u64 swap_add)
+{
+       struct rxe_atmeth *hdr = arg;
+       __be64 wire = cpu_to_be64(swap_add);
+
+       hdr->swap_add = wire;
+}
+
+/* read comp in cpu byte order */
+static inline u64 __atmeth_comp(void *arg)
+{
+       struct rxe_atmeth *hdr = arg;
+       u64 operand = be64_to_cpu(hdr->comp);
+
+       return operand;
+}
+
+/* store @comp in network byte order */
+static inline void __atmeth_set_comp(void *arg, u64 comp)
+{
+       struct rxe_atmeth *hdr = arg;
+       __be64 wire = cpu_to_be64(comp);
+
+       hdr->comp = wire;
+}
+
+/* va from @pkt's ATMETH */
+static inline u64 atmeth_va(struct rxe_pkt_info *pkt)
+{
+       void *atmeth = pkt->hdr + pkt->offset +
+                      rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       return __atmeth_va(atmeth);
+}
+
+/* set va in @pkt's ATMETH */
+static inline void atmeth_set_va(struct rxe_pkt_info *pkt, u64 va)
+{
+       void *atmeth = pkt->hdr + pkt->offset +
+                      rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       __atmeth_set_va(atmeth, va);
+}
+
+/* rkey from @pkt's ATMETH */
+static inline u32 atmeth_rkey(struct rxe_pkt_info *pkt)
+{
+       void *atmeth = pkt->hdr + pkt->offset +
+                      rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       return __atmeth_rkey(atmeth);
+}
+
+/* set rkey in @pkt's ATMETH */
+static inline void atmeth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey)
+{
+       void *atmeth = pkt->hdr + pkt->offset +
+                      rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       __atmeth_set_rkey(atmeth, rkey);
+}
+
+/* swap_add from @pkt's ATMETH */
+static inline u64 atmeth_swap_add(struct rxe_pkt_info *pkt)
+{
+       void *atmeth = pkt->hdr + pkt->offset +
+                      rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       return __atmeth_swap_add(atmeth);
+}
+
+/* set swap_add in @pkt's ATMETH */
+static inline void atmeth_set_swap_add(struct rxe_pkt_info *pkt, u64 swap_add)
+{
+       void *atmeth = pkt->hdr + pkt->offset +
+                      rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       __atmeth_set_swap_add(atmeth, swap_add);
+}
+
+/* comp from @pkt's ATMETH */
+static inline u64 atmeth_comp(struct rxe_pkt_info *pkt)
+{
+       void *atmeth = pkt->hdr + pkt->offset +
+                      rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       return __atmeth_comp(atmeth);
+}
+
+/* set comp in @pkt's ATMETH */
+static inline void atmeth_set_comp(struct rxe_pkt_info *pkt, u64 comp)
+{
+       void *atmeth = pkt->hdr + pkt->offset +
+                      rxe_opcode[pkt->opcode].offset[RXE_ATMETH];
+
+       __atmeth_set_comp(atmeth, comp);
+}
+
+/******************************************************************************
+ * Ack Extended Transport Header
+ ******************************************************************************/
+struct rxe_aeth {
+       /* syndrome in the top byte, MSN in the low 24 bits */
+       __be32                  smsn;
+};
+
+#define AETH_SYN_MASK          (0xff000000)
+#define AETH_MSN_MASK          (0x00ffffff)
+
+enum aeth_syndrome {
+       AETH_TYPE_MASK          = 0xe0, /* top three bits select the type */
+       AETH_ACK                = 0x00,
+       AETH_RNR_NAK            = 0x20,
+       AETH_RSVD               = 0x40,
+       AETH_NAK                = 0x60,
+       AETH_ACK_UNLIMITED      = 0x1f, /* all five bits below the type set */
+       AETH_NAK_PSN_SEQ_ERROR  = 0x60, /* same value as AETH_NAK */
+       AETH_NAK_INVALID_REQ    = 0x61, /* NAK type bits plus a low-bit code */
+       AETH_NAK_REM_ACC_ERR    = 0x62,
+       AETH_NAK_REM_OP_ERR     = 0x63,
+       AETH_NAK_INV_RD_REQ     = 0x64,
+};
+
+static inline u8 __aeth_syn(void *arg)
+{      /* arg points at a struct rxe_aeth; syndrome is bits 31..24 of smsn */
+       struct rxe_aeth *aeth = arg;
+
+       return (AETH_SYN_MASK & be32_to_cpu(aeth->smsn)) >> 24;
+}
+
+static inline void __aeth_set_syn(void *arg, u8 syn)
+{      /* replace the syndrome byte of smsn, leaving the MSN untouched */
+       struct rxe_aeth *aeth = arg;
+       u32 smsn = be32_to_cpu(aeth->smsn);
+       /* widen first: a u8 promotes to int and 0x80 << 24 would overflow it */
+       aeth->smsn = cpu_to_be32((AETH_SYN_MASK & ((u32)syn << 24)) |
+                        (~AETH_SYN_MASK & smsn));
+}
+
+static inline u32 __aeth_msn(void *arg)
+{      /* arg points at a struct rxe_aeth; MSN is bits 23..0 of smsn */
+       struct rxe_aeth *aeth = arg;
+
+       return AETH_MSN_MASK & be32_to_cpu(aeth->smsn);
+}
+
+static inline void __aeth_set_msn(void *arg, u32 msn)
+{      /* replace the 24-bit MSN, leaving the syndrome byte untouched */
+       struct rxe_aeth *aeth = arg;
+       u32 smsn = be32_to_cpu(aeth->smsn);
+
+       aeth->smsn = cpu_to_be32((AETH_MSN_MASK & msn) |
+                        (~AETH_MSN_MASK & smsn));
+}
+
+static inline u8 aeth_syn(struct rxe_pkt_info *pkt)
+{      /* read the syndrome from the AETH found via the rxe_opcode[] offsets */
+       return __aeth_syn(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_AETH]);
+}
+
+static inline void aeth_set_syn(struct rxe_pkt_info *pkt, u8 syn)
+{      /* write the syndrome of the AETH found via the rxe_opcode[] offsets */
+       __aeth_set_syn(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_AETH], syn);
+}
+
+static inline u32 aeth_msn(struct rxe_pkt_info *pkt)
+{      /* read the MSN from the AETH found via the rxe_opcode[] offsets */
+       return __aeth_msn(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_AETH]);
+}
+
+static inline void aeth_set_msn(struct rxe_pkt_info *pkt, u32 msn)
+{      /* write the MSN of the AETH found via the rxe_opcode[] offsets */
+       __aeth_set_msn(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_AETH], msn);
+}
+
+/******************************************************************************
+ * Atomic Ack Extended Transport Header
+ ******************************************************************************/
+struct rxe_atmack {
+       __be64                  orig;   /* stored big-endian */
+};
+
+static inline u64 __atmack_orig(void *arg)
+{      /* arg points at the start of a struct rxe_atmack */
+       struct rxe_atmack *atmack = arg;
+
+       return be64_to_cpu(atmack->orig);       /* to CPU byte order */
+}
+
+static inline void __atmack_set_orig(void *arg, u64 orig)
+{      /* arg points at the start of a struct rxe_atmack */
+       struct rxe_atmack *atmack = arg;
+
+       atmack->orig = cpu_to_be64(orig);       /* stored big-endian */
+}
+
+static inline u64 atmack_orig(struct rxe_pkt_info *pkt)
+{      /* read orig from the ATMACK found via the rxe_opcode[] offsets */
+       return __atmack_orig(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_ATMACK]);
+}
+
+static inline void atmack_set_orig(struct rxe_pkt_info *pkt, u64 orig)
+{      /* write orig into the ATMACK found via the rxe_opcode[] offsets */
+       __atmack_set_orig(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_ATMACK], orig);
+}
+
+/******************************************************************************
+ * Immediate Extended Transport Header
+ ******************************************************************************/
+struct rxe_immdt {
+       __be32                  imm;    /* accessors do not byte-swap this */
+};
+
+static inline __be32 __immdt_imm(void *arg)
+{      /* arg points at the start of a struct rxe_immdt */
+       struct rxe_immdt *immdt = arg;
+
+       return immdt->imm;      /* returned as stored, no byte swap */
+}
+
+static inline void __immdt_set_imm(void *arg, __be32 imm)
+{      /* arg points at the start of a struct rxe_immdt */
+       struct rxe_immdt *immdt = arg;
+
+       immdt->imm = imm;       /* caller passes the value already as __be32 */
+}
+
+static inline __be32 immdt_imm(struct rxe_pkt_info *pkt)
+{      /* read imm from the IMMDT found via the rxe_opcode[] offsets */
+       return __immdt_imm(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_IMMDT]);
+}
+
+static inline void immdt_set_imm(struct rxe_pkt_info *pkt, __be32 imm)
+{      /* write imm into the IMMDT found via the rxe_opcode[] offsets */
+       __immdt_set_imm(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_IMMDT], imm);
+}
+
+/******************************************************************************
+ * Invalidate Extended Transport Header
+ ******************************************************************************/
+struct rxe_ieth {
+       __be32                  rkey;   /* stored big-endian */
+};
+
+static inline u32 __ieth_rkey(void *arg)
+{      /* arg points at the start of a struct rxe_ieth */
+       struct rxe_ieth *ieth = arg;
+
+       return be32_to_cpu(ieth->rkey); /* to CPU byte order */
+}
+
+static inline void __ieth_set_rkey(void *arg, u32 rkey)
+{      /* arg points at the start of a struct rxe_ieth */
+       struct rxe_ieth *ieth = arg;
+
+       ieth->rkey = cpu_to_be32(rkey); /* stored big-endian */
+}
+
+static inline u32 ieth_rkey(struct rxe_pkt_info *pkt)
+{      /* read rkey from the IETH found via the rxe_opcode[] offsets */
+       return __ieth_rkey(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_IETH]);
+}
+
+static inline void ieth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey)
+{      /* write rkey into the IETH found via the rxe_opcode[] offsets */
+       __ieth_set_rkey(pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_IETH], rkey);
+}
+
+enum rxe_hdr_length {  /* size in bytes of each header struct */
+       RXE_BTH_BYTES           = sizeof(struct rxe_bth),
+       RXE_DETH_BYTES          = sizeof(struct rxe_deth),
+       RXE_IMMDT_BYTES         = sizeof(struct rxe_immdt),
+       RXE_RETH_BYTES          = sizeof(struct rxe_reth),
+       RXE_AETH_BYTES          = sizeof(struct rxe_aeth),
+       RXE_ATMACK_BYTES        = sizeof(struct rxe_atmack),
+       RXE_ATMETH_BYTES        = sizeof(struct rxe_atmeth),
+       RXE_IETH_BYTES          = sizeof(struct rxe_ieth),
+       RXE_RDETH_BYTES         = sizeof(struct rxe_rdeth),
+};
+
+static inline size_t header_size(struct rxe_pkt_info *pkt)
+{      /* pkt->offset plus the rxe_opcode[] length for this opcode */
+       return pkt->offset + rxe_opcode[pkt->opcode].length;
+}
+
+static inline void *payload_addr(struct rxe_pkt_info *pkt)
+{      /* address at this opcode's RXE_PAYLOAD offset within the packet */
+       return pkt->hdr + pkt->offset
+               + rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD];
+}
+
+static inline size_t payload_size(struct rxe_pkt_info *pkt)
+{      /* NOTE(review): huge size_t if paylen is too small; confirm callers */
+       return pkt->paylen - rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD]
+               - bth_pad(pkt) - RXE_ICRC_SIZE;
+}
+
+#endif /* RXE_HDR_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c

new file mode 100644 (file)

index 0000000..413b56b
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_icrc.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* Compute a partial ICRC over the masked IP/UDP pseudo header and all the
+ * IB transport headers of @pkt; the payload is not covered here.
+ */
+u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb)
+{
+       struct iphdr *ip4h = NULL;
+       struct ipv6hdr *ip6h = NULL;
+       struct udphdr *udph;
+       struct rxe_bth *bth;
+       u32 crc;        /* u32: the seed below does not fit in a signed int */
+       int length;
+       int hdr_size = sizeof(struct udphdr) +
+               (skb->protocol == htons(ETH_P_IP) ?
+               sizeof(struct iphdr) : sizeof(struct ipv6hdr));
+       /* pseudo header buffer size is calculated using ipv6 header size since
+        * it is bigger than ipv4
+        */
+       u8 pshdr[sizeof(struct udphdr) +
+               sizeof(struct ipv6hdr) +
+               RXE_BTH_BYTES];
+
+       /* This seed is the result of computing a CRC with a seed of
+        * 0xffffffff and 8 bytes of 0xff representing a masked LRH.
+        */
+       crc = 0xdebb20e3;
+
+       if (skb->protocol == htons(ETH_P_IP)) { /* IPv4 */
+               memcpy(pshdr, ip_hdr(skb), hdr_size);
+               ip4h = (struct iphdr *)pshdr;
+               udph = (struct udphdr *)(ip4h + 1);
+
+               ip4h->ttl = 0xff;
+               ip4h->check = CSUM_MANGLED_0;
+               ip4h->tos = 0xff;
+       } else {                                /* IPv6 */
+               memcpy(pshdr, ipv6_hdr(skb), hdr_size);
+               ip6h = (struct ipv6hdr *)pshdr;
+               udph = (struct udphdr *)(ip6h + 1);
+
+               memset(ip6h->flow_lbl, 0xff, sizeof(ip6h->flow_lbl));
+               ip6h->priority = 0xf;
+               ip6h->hop_limit = 0xff;
+       }
+       udph->check = CSUM_MANGLED_0;
+
+       /* the BTH copy goes right behind the IP and UDP headers */
+       memcpy(&pshdr[hdr_size], pkt->hdr, RXE_BTH_BYTES);
+       bth = (struct rxe_bth *)&pshdr[hdr_size];
+
+       /* exclude bth.resv8a */
+       bth->qpn |= cpu_to_be32(~BTH_QPN_MASK);
+
+       length = hdr_size + RXE_BTH_BYTES;
+       crc = crc32_le(crc, pshdr, length);
+
+       /* And finish to compute the CRC on the remainder of the headers. */
+       crc = crc32_le(crc, pkt->hdr + RXE_BTH_BYTES,
+                      rxe_opcode[pkt->opcode].length - RXE_BTH_BYTES);
+       return crc;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h

new file mode 100644 (file)

index 0000000..4a5484e
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_LOC_H
+#define RXE_LOC_H
+
+/* rxe_av.c */
+
+int rxe_av_chk_attr(struct rxe_dev *rxe, struct ib_ah_attr *attr);
+
+int rxe_av_from_attr(struct rxe_dev *rxe, u8 port_num,
+                    struct rxe_av *av, struct ib_ah_attr *attr);
+
+int rxe_av_to_attr(struct rxe_dev *rxe, struct rxe_av *av,
+                  struct ib_ah_attr *attr);
+
+int rxe_av_fill_ip_info(struct rxe_dev *rxe,
+                       struct rxe_av *av,
+                       struct ib_ah_attr *attr,
+                       struct ib_gid_attr *sgid_attr,
+                       union ib_gid *sgid);
+
+struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt);
+
+/* rxe_cq.c */
+int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
+                   int cqe, int comp_vector, struct ib_udata *udata);
+
+int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
+                    int comp_vector, struct ib_ucontext *context,
+                    struct ib_udata *udata);
+
+int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe, struct ib_udata *udata);
+
+int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited);
+
+void rxe_cq_cleanup(void *arg);
+
+/* rxe_mcast.c */
+int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
+                     struct rxe_mc_grp **grp_p);
+
+int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+                          struct rxe_mc_grp *grp);
+
+int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+                           union ib_gid *mgid);
+
+void rxe_drop_all_mcast_groups(struct rxe_qp *qp);
+
+void rxe_mc_cleanup(void *arg);
+
+/* rxe_mmap.c */
+struct rxe_mmap_info {
+       struct list_head        pending_mmaps;  /* link on rxe->pending_mmaps */
+       struct ib_ucontext      *context;       /* matched in rxe_mmap() */
+       struct kref             ref;            /* see rxe_mmap_release() */
+       void                    *obj;           /* buffer, vfree()d on release */
+
+       struct mminfo info;     /* offset and size that rxe_mmap() checks */
+};
+
+void rxe_mmap_release(struct kref *ref);
+
+struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev,
+                                          u32 size,
+                                          struct ib_ucontext *context,
+                                          void *obj);
+
+int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+/* rxe_mr.c */
+enum copy_direction {
+       to_mem_obj,     /* presumably copy into the memory object - confirm */
+       from_mem_obj,   /* presumably copy out of the memory object - confirm */
+};
+
+int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd,
+                    int access, struct rxe_mem *mem);
+
+int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start,
+                     u64 length, u64 iova, int access, struct ib_udata *udata,
+                     struct rxe_mem *mr);
+
+int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd,
+                     int max_pages, struct rxe_mem *mem);
+
+int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr,
+                int length, enum copy_direction dir, u32 *crcp);
+
+int copy_data(struct rxe_dev *rxe, struct rxe_pd *pd, int access,
+             struct rxe_dma_info *dma, void *addr, int length,
+             enum copy_direction dir, u32 *crcp);
+
+void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length);
+
+enum lookup_type {
+       lookup_local,   /* presumably match on the local key - confirm */
+       lookup_remote,  /* presumably match on the remote key - confirm */
+};
+
+struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key,
+                          enum lookup_type type);
+
+int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length);
+
+int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem,
+                     u64 *page, int num_pages, u64 iova);
+
+void rxe_mem_cleanup(void *arg);
+
+int advance_dma_data(struct rxe_dma_info *dma, unsigned int length);
+
+/* rxe_qp.c */
+int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init);
+
+int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
+                    struct ib_qp_init_attr *init, struct ib_udata *udata,
+                    struct ib_pd *ibpd);
+
+int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init);
+
+int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp,
+                   struct ib_qp_attr *attr, int mask);
+
+int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr,
+                    int mask, struct ib_udata *udata);
+
+int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask);
+
+void rxe_qp_error(struct rxe_qp *qp);
+
+void rxe_qp_destroy(struct rxe_qp *qp);
+
+void rxe_qp_cleanup(void *arg);
+
+static inline int qp_num(struct rxe_qp *qp)
+{
+       return qp->ibqp.qp_num; /* taken from the embedded ibqp */
+}
+
+static inline enum ib_qp_type qp_type(struct rxe_qp *qp)
+{
+       return qp->ibqp.qp_type;        /* taken from the embedded ibqp */
+}
+
+static inline enum ib_qp_state qp_state(struct rxe_qp *qp)
+{
+       return qp->attr.qp_state;       /* taken from the cached QP attributes */
+}
+
+static inline int qp_mtu(struct rxe_qp *qp)
+{
+       /* RC and UC use the QP path MTU; everything else the port maximum */
+       if (qp_type(qp) != IB_QPT_RC && qp_type(qp) != IB_QPT_UC)
+               return RXE_PORT_MAX_MTU;
+       return qp->attr.path_mtu;
+}
+
+static inline int rcv_wqe_size(int max_sge)
+{      /* fixed rxe_recv_wqe part plus max_sge ib_sge entries */
+       return sizeof(struct rxe_recv_wqe) +
+               max_sge * sizeof(struct ib_sge);
+}
+
+void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res);
+
+static inline void rxe_advance_resp_resource(struct rxe_qp *qp)
+{
+       /* advance res_head, wrapping to 0 on reaching max_rd_atomic */
+       if (unlikely(++qp->resp.res_head == qp->attr.max_rd_atomic))
+               qp->resp.res_head = 0;
+}
+
+void retransmit_timer(unsigned long data);
+void rnr_nak_timer(unsigned long data);
+
+void dump_qp(struct rxe_qp *qp);
+
+/* rxe_srq.c */
+#define IB_SRQ_INIT_MASK (~IB_SRQ_LIMIT)       /* all bits but IB_SRQ_LIMIT */
+
+int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+                    struct ib_srq_attr *attr, enum ib_srq_attr_mask mask);
+
+int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
+                     struct ib_srq_init_attr *init,
+                     struct ib_ucontext *context, struct ib_udata *udata);
+
+int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+                     struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
+                     struct ib_udata *udata);
+
+extern struct ib_dma_mapping_ops rxe_dma_mapping_ops;
+
+void rxe_release(struct kref *kref);
+
+int rxe_completer(void *arg);
+int rxe_requester(void *arg);
+int rxe_responder(void *arg);
+
+u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb);
+
+void rxe_resp_queue_pkt(struct rxe_dev *rxe,
+                       struct rxe_qp *qp, struct sk_buff *skb);
+
+void rxe_comp_queue_pkt(struct rxe_dev *rxe,
+                       struct rxe_qp *qp, struct sk_buff *skb);
+
+static inline unsigned wr_opcode_mask(int opcode, struct rxe_qp *qp)
+{      /* rxe_wr_opcode_info[] mask for this opcode on the QP type of qp */
+       return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type];
+}
+
+static inline int rxe_xmit_packet(struct rxe_dev *rxe, struct rxe_qp *qp,
+                                 struct rxe_pkt_info *pkt, struct sk_buff *skb)
+{      /* returns 0 when sent or silently dropped, else the ifc_ops error */
+       int err;
+       int is_request = pkt->mask & RXE_REQ_MASK;
+
+       if ((is_request && (qp->req.state != QP_STATE_READY)) ||
+           (!is_request && (qp->resp.state != QP_STATE_READY))) {
+               pr_info("Packet dropped. QP is not in ready state\n");
+               goto drop;
+       }
+
+       if (pkt->mask & RXE_LOOPBACK_MASK) {
+               memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt));
+               err = rxe->ifc_ops->loopback(skb);
+       } else {
+               err = rxe->ifc_ops->send(rxe, pkt, skb);
+       }
+
+       if (err) {
+               rxe->xmit_errors++;
+               return err;     /* skb is not freed by this function here */
+       }
+
+       atomic_inc(&qp->skb_out);       /* counted only after a successful send */
+
+       if ((qp_type(qp) != IB_QPT_RC) &&
+           (pkt->mask & RXE_END_MASK)) {
+               pkt->wqe->state = wqe_state_done;
+               rxe_run_task(&qp->comp.task, 1);
+       }
+
+       goto done;
+
+drop:
+       kfree_skb(skb);
+       err = 0;        /* a dropped packet is not reported as an error */
+done:
+       return err;
+}
+
+#endif /* RXE_LOC_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c

new file mode 100644 (file)

index 0000000..fa95544
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *             - Redistributions of source code must retain the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer.
+ *
+ *             - Redistributions in binary form must reproduce the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer in the documentation and/or other materials
+ *               provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
+                     struct rxe_mc_grp **grp_p)
+{      /* find the group keyed by mgid or create it; result goes to *grp_p */
+       int err;
+       struct rxe_mc_grp *grp;
+
+       if (rxe->attr.max_mcast_qp_attach == 0) {
+               err = -EINVAL;  /* device allows no multicast attachments */
+               goto err1;
+       }
+
+       grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid);
+       if (grp)
+               goto done;      /* existing group, found by key */
+       /* NOTE(review): lookup and alloc take no common lock here - confirm */
+       grp = rxe_alloc(&rxe->mc_grp_pool);
+       if (!grp) {
+               err = -ENOMEM;
+               goto err1;
+       }
+
+       INIT_LIST_HEAD(&grp->qp_list);
+       spin_lock_init(&grp->mcg_lock);
+       grp->rxe = rxe;
+
+       rxe_add_key(grp, mgid);
+
+       err = rxe->ifc_ops->mcast_add(rxe, mgid);
+       if (err)
+               goto err2;
+
+done:
+       *grp_p = grp;
+       return 0;
+
+err2:
+       rxe_drop_ref(grp);      /* give back the group allocated above */
+err1:
+       return err;
+}
+
+int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+                          struct rxe_mc_grp *grp)
+{      /* attach qp to grp; returns 0 if added or already a member */
+       int err;
+       struct rxe_mc_elem *elem;
+
+       /* check to see if the qp is already a member of the group */
+       spin_lock_bh(&qp->grp_lock);    /* lock order: qp, then group */
+       spin_lock_bh(&grp->mcg_lock);
+       list_for_each_entry(elem, &grp->qp_list, qp_list) {
+               if (elem->qp == qp) {
+                       err = 0;
+                       goto out;
+               }
+       }
+
+       if (grp->num_qp >= rxe->attr.max_mcast_qp_attach) {
+               err = -ENOMEM;  /* group is full */
+               goto out;
+       }
+
+       elem = rxe_alloc(&rxe->mc_elem_pool);
+       if (!elem) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       /* each qp holds a ref on the grp */
+       rxe_add_ref(grp);
+
+       grp->num_qp++;
+       elem->qp = qp;
+       elem->grp = grp;
+
+       list_add(&elem->qp_list, &grp->qp_list);        /* group side */
+       list_add(&elem->grp_list, &qp->grp_list);       /* qp side */
+
+       err = 0;
+out:
+       spin_unlock_bh(&grp->mcg_lock);
+       spin_unlock_bh(&qp->grp_lock);
+       return err;
+}
+
+int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+                           union ib_gid *mgid)
+{      /* detach qp from the group keyed by mgid; -EINVAL if not attached */
+       struct rxe_mc_grp *grp;
+       struct rxe_mc_elem *elem, *tmp;
+
+       grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid);
+       if (!grp)
+               goto err1;
+
+       spin_lock_bh(&qp->grp_lock);    /* lock order: qp, then group */
+       spin_lock_bh(&grp->mcg_lock);
+
+       list_for_each_entry_safe(elem, tmp, &grp->qp_list, qp_list) {
+               if (elem->qp == qp) {
+                       list_del(&elem->qp_list);
+                       list_del(&elem->grp_list);
+                       grp->num_qp--;
+                       /* references are dropped only after both unlocks */
+                       spin_unlock_bh(&grp->mcg_lock);
+                       spin_unlock_bh(&qp->grp_lock);
+                       rxe_drop_ref(elem);
+                       rxe_drop_ref(grp);      /* ref held by QP */
+                       rxe_drop_ref(grp);      /* ref from get_key */
+                       return 0;
+               }
+       }
+
+       spin_unlock_bh(&grp->mcg_lock);
+       spin_unlock_bh(&qp->grp_lock);
+       rxe_drop_ref(grp);                      /* ref from get_key */
+err1:
+       return -EINVAL;
+}
+
+void rxe_drop_all_mcast_groups(struct rxe_qp *qp)
+{      /* detach qp from every group linked on its grp_list */
+       struct rxe_mc_grp *grp;
+       struct rxe_mc_elem *elem;
+
+       while (1) {
+               spin_lock_bh(&qp->grp_lock);
+               if (list_empty(&qp->grp_list)) {
+                       spin_unlock_bh(&qp->grp_lock);
+                       break;
+               }
+               elem = list_first_entry(&qp->grp_list, struct rxe_mc_elem,
+                                       grp_list);
+               list_del(&elem->grp_list);
+               spin_unlock_bh(&qp->grp_lock);  /* locks are not nested here */
+
+               grp = elem->grp;
+               spin_lock_bh(&grp->mcg_lock);
+               list_del(&elem->qp_list);
+               grp->num_qp--;
+               spin_unlock_bh(&grp->mcg_lock);
+               rxe_drop_ref(grp);
+               rxe_drop_ref(elem);
+       }
+}
+
+void rxe_mc_cleanup(void *arg)
+{      /* arg is the struct rxe_mc_grp being torn down */
+       struct rxe_mc_grp *grp = arg;
+       struct rxe_dev *rxe = grp->rxe;
+
+       rxe_drop_key(grp);
+       rxe->ifc_ops->mcast_delete(rxe, &grp->mgid);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c

new file mode 100644 (file)

index 0000000..54b3c7c
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_mmap.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/pgtable.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+void rxe_mmap_release(struct kref *ref)
+{      /* kref release callback: unlink, then free the buffer and the info */
+       struct rxe_mmap_info *ip = container_of(ref,
+                                       struct rxe_mmap_info, ref);
+       struct rxe_dev *rxe = to_rdev(ip->context->device);
+
+       spin_lock_bh(&rxe->pending_lock);
+
+       if (!list_empty(&ip->pending_mmaps))    /* still waiting for mmap() */
+               list_del(&ip->pending_mmaps);
+
+       spin_unlock_bh(&rxe->pending_lock);
+
+       vfree(ip->obj);         /* buf */
+       kfree(ip);
+}
+
+/*
+ * open and close keep track of how many times the memory region is mapped,
+ * so that it is not released while a mapping still exists.
+ */
+static void rxe_vma_open(struct vm_area_struct *vma)
+{
+       struct rxe_mmap_info *ip = vma->vm_private_data;
+
+       kref_get(&ip->ref);     /* one reference per open mapping */
+}
+
+static void rxe_vma_close(struct vm_area_struct *vma)
+{
+       struct rxe_mmap_info *ip = vma->vm_private_data;
+
+       kref_put(&ip->ref, rxe_mmap_release);   /* last put frees the info */
+}
+
+static const struct vm_operations_struct rxe_vm_ops = {
+       .open = rxe_vma_open,   /* takes a reference on the mmap info */
+       .close = rxe_vma_close, /* drops that reference again */
+};
+
+/**
+ * rxe_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return zero if the mmap is OK. Otherwise, return an errno.
+ */
+int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+       struct rxe_dev *rxe = to_rdev(context->device);
+       unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+       unsigned long size = vma->vm_end - vma->vm_start;
+       struct rxe_mmap_info *ip, *pp;
+       int ret;
+
+       /*
+        * Search the device's list of objects waiting for a mmap call.
+        * Normally, this list is very short since a call to create a
+        * CQ, QP, or SRQ is soon followed by a call to mmap().
+        */
+       spin_lock_bh(&rxe->pending_lock);
+       list_for_each_entry_safe(ip, pp, &rxe->pending_mmaps, pending_mmaps) {
+               if (context != ip->context || (__u64)offset != ip->info.offset)
+                       continue;
+
+               /* Don't allow a mmap larger than the object. */
+               if (size > ip->info.size) {
+                       pr_err("mmap region is larger than the object!\n");
+                       spin_unlock_bh(&rxe->pending_lock);
+                       ret = -EINVAL;
+                       goto done;
+               }
+
+               goto found_it;  /* pending_lock is still held */
+       }
+       pr_warn("unable to find pending mmap info\n");
+       spin_unlock_bh(&rxe->pending_lock);
+       ret = -EINVAL;
+       goto done;
+
+found_it:
+       list_del_init(&ip->pending_mmaps);      /* entry is consumed here */
+       spin_unlock_bh(&rxe->pending_lock);
+
+       ret = remap_vmalloc_range(vma, ip->obj, 0);
+       if (ret) {
+               pr_err("rxe: err %d from remap_vmalloc_range\n", ret);
+               goto done;      /* NOTE(review): entry is not re-queued */
+       }
+
+       vma->vm_ops = &rxe_vm_ops;
+       vma->vm_private_data = ip;
+       rxe_vma_open(vma);      /* take the reference for this mapping */
+done:
+       return ret;
+}
+
+/*
+ * Allocate information for rxe_mmap; returns NULL if kmalloc fails
+ */
+struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe,
+                                          u32 size,
+                                          struct ib_ucontext *context,
+                                          void *obj)
+{
+       struct rxe_mmap_info *ip;
+
+       ip = kmalloc(sizeof(*ip), GFP_KERNEL);
+       if (!ip)
+               return NULL;
+
+       size = PAGE_ALIGN(size);        /* offsets advance in whole pages */
+
+       spin_lock_bh(&rxe->mmap_offset_lock);
+
+       if (rxe->mmap_offset == 0)
+               rxe->mmap_offset = PAGE_SIZE;   /* never hand out offset 0 */
+
+       ip->info.offset = rxe->mmap_offset;
+       rxe->mmap_offset += size;       /* reserve [offset, offset + size) */
+
+       spin_unlock_bh(&rxe->mmap_offset_lock);
+
+       INIT_LIST_HEAD(&ip->pending_mmaps);     /* not queued by this function */
+       ip->info.size = size;
+       ip->context = context;
+       ip->obj = obj;
+       kref_init(&ip->ref);
+
+       return ip;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c

new file mode 100644 (file)

index 0000000..f3dab65
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -0,0 +1,643 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/*
+ * Return the next 8-bit key from a linear feedback shift register with
+ * period 255.  The register lives in a function-local static, so every
+ * call advances the same sequence.
+ */
+static u8 rxe_get_key(void)
+{
+       static unsigned key = 1;
+       unsigned int feedback;
+
+       key <<= 1;
+
+       /* the new low bit is the XOR of the four tapped bits */
+       feedback  = !!(key & 0x100);
+       feedback ^= !!(key & 0x10);
+       feedback ^= !!(key & 0x80);
+       feedback ^= !!(key & 0x40);
+
+       key = (key | feedback) & 0xff;
+
+       return key;
+}
+
+/*
+ * Check that the range [iova, iova + length) lies inside a memory object.
+ *
+ * DMA objects have no bounds and always pass.  For MR/FMR objects the
+ * range must fall within [mem->iova, mem->iova + mem->length).  The test
+ * is written so that "iova + length" is never computed: that sum can wrap
+ * around and make an out-of-range request look valid.
+ *
+ * Returns 0 when the range is acceptable, -EFAULT otherwise (including
+ * for unknown object types).
+ */
+int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length)
+{
+       switch (mem->type) {
+       case RXE_MEM_TYPE_DMA:
+               return 0;
+
+       case RXE_MEM_TYPE_MR:
+       case RXE_MEM_TYPE_FMR:
+               /* length <= mem->length, so the subtraction cannot wrap */
+               if (iova < mem->iova ||
+                   length > mem->length ||
+                   iova > mem->iova + mem->length - length)
+                       return -EFAULT;
+               return 0;
+
+       default:
+               return -EFAULT;
+       }
+}
+
+#define IB_ACCESS_REMOTE       (IB_ACCESS_REMOTE_READ          \
+                               | IB_ACCESS_REMOTE_WRITE        \
+                               | IB_ACCESS_REMOTE_ATOMIC)
+
+/*
+ * Give a memory object its keys and put it into the initial
+ * (invalid, untyped) state.  The lkey is the pool index shifted left by
+ * eight bits with a fresh 8-bit key in the low byte; the rkey equals the
+ * lkey when any remote access right is requested and is 0 otherwise.
+ */
+static void rxe_mem_init(int access, struct rxe_mem *mem)
+{
+       u32 lkey;
+       u32 rkey;
+
+       lkey = mem->pelem.index << 8 | rxe_get_key();
+
+       if (access & IB_ACCESS_REMOTE)
+               rkey = lkey;
+       else
+               rkey = 0;
+
+       mem->lkey = lkey;
+       mem->rkey = rkey;
+
+       /* only objects from the MR pool mirror the keys into the ib_mr */
+       if (mem->pelem.pool->type == RXE_TYPE_MR) {
+               mem->ibmr.lkey = lkey;
+               mem->ibmr.rkey = rkey;
+       }
+
+       mem->type = RXE_MEM_TYPE_NONE;
+       mem->state = RXE_MEM_STATE_INVALID;
+       mem->map_shift = ilog2(RXE_BUF_PER_MAP);
+}
+
+/*
+ * Release what a memory object owns: its umem (if one is attached) and
+ * its map table together with every map the table points to.
+ * arg is the struct rxe_mem being torn down.
+ */
+void rxe_mem_cleanup(void *arg)
+{
+       struct rxe_mem *mem = arg;
+       int idx;
+
+       if (mem->umem)
+               ib_umem_release(mem->umem);
+
+       if (!mem->map)
+               return;
+
+       for (idx = 0; idx < mem->num_map; idx++)
+               kfree(mem->map[idx]);
+
+       kfree(mem->map);
+}
+
+/*
+ * Allocate the two-level map table able to describe num_buf buffers:
+ * an array of map pointers plus one struct rxe_map for each group of
+ * RXE_BUF_PER_MAP buffers.
+ *
+ * The table is attached to mem only when every allocation succeeded.
+ * On failure everything allocated here is freed and mem->map is left
+ * NULL, so that rxe_mem_cleanup (which frees a non-NULL mem->map) can
+ * never free the table a second time.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int rxe_mem_alloc(struct rxe_dev *rxe, struct rxe_mem *mem, int num_buf)
+{
+       int i;
+       int num_map;
+       struct rxe_map **map;
+
+       num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP;
+
+       map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL);
+       if (!map)
+               goto err1;
+
+       for (i = 0; i < num_map; i++) {
+               map[i] = kmalloc(sizeof(**map), GFP_KERNEL);
+               if (!map[i])
+                       goto err2;
+       }
+
+       /* map_shift/map_mask below only work for a power-of-two group size */
+       WARN_ON(!is_power_of_2(RXE_BUF_PER_MAP));
+
+       mem->map        = map;
+       mem->map_shift  = ilog2(RXE_BUF_PER_MAP);
+       mem->map_mask   = RXE_BUF_PER_MAP - 1;
+
+       mem->num_buf = num_buf;
+       mem->num_map = num_map;
+       mem->max_buf = num_map * RXE_BUF_PER_MAP;
+
+       return 0;
+
+err2:
+       /* map[i] is the allocation that failed; free the ones before it */
+       for (i--; i >= 0; i--)
+               kfree(map[i]);
+
+       kfree(map);
+err1:
+       mem->map = NULL;
+       return -ENOMEM;
+}
+
+/*
+ * Set up mem as a DMA memory object: keys are assigned by rxe_mem_init
+ * and the object becomes valid immediately.  No map table is allocated.
+ * Always returns 0.
+ */
+int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd,
+                    int access, struct rxe_mem *mem)
+{
+       rxe_mem_init(access, mem);
+
+       /* override the invalid/untyped defaults set by rxe_mem_init */
+       mem->type = RXE_MEM_TYPE_DMA;
+       mem->state = RXE_MEM_STATE_VALID;
+       mem->access = access;
+       mem->pd = pd;
+
+       return 0;
+}
+
+/*
+ * Set up mem as a memory region backed by the user buffer
+ * [start, start + length).
+ *
+ * The buffer is obtained with ib_umem_get(), a map table sized for
+ * umem->nmap entries is allocated, and the kernel address and size of
+ * every scatterlist entry are recorded in that table.
+ *
+ * Returns 0 on success.  Returns -EINVAL when ib_umem_get() fails, the
+ * error from rxe_mem_alloc() when the map table cannot be allocated, and
+ * -ENOMEM when a page has no kernel address.
+ *
+ * NOTE(review): on the "null vaddr" failure mem->umem and mem->map stay
+ * attached to mem; this relies on the caller eventually running
+ * rxe_mem_cleanup() on mem -- confirm against the caller.
+ */
+int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start,
+                     u64 length, u64 iova, int access, struct ib_udata *udata,
+                     struct rxe_mem *mem)
+{
+       int                     entry;
+       int                     m;
+       int                     n;
+       struct rxe_phys_buf     *buf;
+       struct ib_umem          *umem;
+       struct scatterlist      *sg;
+       void                    *vaddr;
+       int                     err;
+
+       umem = ib_umem_get(pd->ibpd.uobject->context, start, length, access, 0);
+       if (IS_ERR(umem)) {
+               pr_warn("err %d from ib_umem_get\n",
+                       (int)PTR_ERR(umem));
+               err = -EINVAL;
+               goto err1;
+       }
+
+       mem->umem = umem;
+
+       rxe_mem_init(access, mem);
+
+       err = rxe_mem_alloc(rxe, mem, umem->nmap);
+       if (err) {
+               pr_warn("err %d from rxe_mem_alloc\n", err);
+               ib_umem_release(umem);
+               /*
+                * rxe_mem_cleanup releases any non-NULL mem->umem; clear
+                * the pointer so the umem is not released a second time.
+                */
+               mem->umem = NULL;
+               goto err1;
+       }
+
+       /* page_shift/page_mask below only work for power-of-two pages */
+       WARN_ON(!is_power_of_2(umem->page_size));
+
+       mem->page_shift         = ilog2(umem->page_size);
+       mem->page_mask          = umem->page_size - 1;
+
+       if (length > 0) {
+               for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+                       vaddr = page_address(sg_page(sg));
+                       if (!vaddr) {
+                               pr_warn("null vaddr\n");
+                               err = -ENOMEM;
+                               goto err1;
+                       }
+
+                       /*
+                        * Index the table from the entry number so that
+                        * no slot beyond the last entry is ever touched.
+                        */
+                       m = entry / RXE_BUF_PER_MAP;
+                       n = entry % RXE_BUF_PER_MAP;
+                       buf = &mem->map[m]->buf[n];
+
+                       buf->addr = (uintptr_t)vaddr;
+                       buf->size = umem->page_size;
+               }
+       }
+
+       mem->pd                 = pd;
+       mem->access             = access;
+       mem->length             = length;
+       mem->iova               = iova;
+       mem->va                 = start;
+       mem->offset             = ib_umem_offset(umem);
+       mem->state              = RXE_MEM_STATE_VALID;
+       mem->type               = RXE_MEM_TYPE_MR;
+
+       return 0;
+
+err1:
+       return err;
+}
+
+/*
+ * Set up mem as a fast-registration memory region with room for
+ * max_pages buffers.  The object is left in the FREE state.
+ * Returns 0 on success or the error from rxe_mem_alloc().
+ */
+int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd,
+                     int max_pages, struct rxe_mem *mem)
+{
+       int err;
+
+       rxe_mem_init(0, mem);
+
+       /* In fastreg, we also set the rkey */
+       mem->ibmr.rkey = mem->ibmr.lkey;
+
+       err = rxe_mem_alloc(rxe, mem, max_pages);
+       if (err)
+               return err;
+
+       mem->type = RXE_MEM_TYPE_MR;
+       mem->state = RXE_MEM_STATE_FREE;
+       /* replaces the rounded-up max_buf stored by rxe_mem_alloc */
+       mem->max_buf = max_pages;
+       mem->pd = pd;
+
+       return 0;
+}
+
+/*
+ * Translate iova into a position in mem's map table.
+ *
+ * On return *m_out is the map index, *n_out the buffer index inside that
+ * map and *offset_out the byte offset inside that buffer.  When
+ * mem->page_shift is non-zero the position is computed with shifts and
+ * masks; otherwise the buffers are walked one by one using their
+ * recorded sizes.  No range checking is done here.
+ */
+static void lookup_iova(
+       struct rxe_mem  *mem,
+       u64                     iova,
+       int                     *m_out,
+       int                     *n_out,
+       size_t                  *offset_out)
+{
+       size_t                  remaining = iova - mem->iova + mem->offset;
+       int                     m = 0;
+       int                     n = 0;
+       u64                     buf_len;
+
+       if (likely(mem->page_shift)) {
+               size_t page_nr = remaining >> mem->page_shift;
+
+               *offset_out = remaining & mem->page_mask;
+               *n_out = page_nr & mem->map_mask;
+               *m_out = page_nr >> mem->map_shift;
+               return;
+       }
+
+       /* skip whole buffers until the offset falls inside one */
+       for (;;) {
+               buf_len = mem->map[m]->buf[n].size;
+               if (remaining < buf_len)
+                       break;
+
+               remaining -= buf_len;
+               n++;
+
+               if (n == RXE_BUF_PER_MAP) {
+                       m++;
+                       n = 0;
+               }
+       }
+
+       *m_out = m;
+       *n_out = n;
+       *offset_out = remaining;
+}
+
+/*
+ * Return the kernel address that corresponds to iova in mem, provided
+ * the length bytes starting there sit inside a single buffer.
+ *
+ * Returns NULL (after logging a warning) when mem is not in the valid
+ * state, when the range fails mem_check_range(), or when the range
+ * extends past the end of the buffer it starts in.  An object without a
+ * map table uses iova itself as the address.
+ */
+void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length)
+{
+       size_t offset;
+       int m, n;
+
+       if (mem->state != RXE_MEM_STATE_VALID) {
+               pr_warn("mem not in valid state\n");
+               return NULL;
+       }
+
+       if (!mem->map)
+               return (void *)(uintptr_t)iova;
+
+       if (mem_check_range(mem, iova, length)) {
+               pr_warn("range violation\n");
+               return NULL;
+       }
+
+       lookup_iova(mem, iova, &m, &n, &offset);
+
+       if (offset + length > mem->map[m]->buf[n].size) {
+               pr_warn("crosses page boundary\n");
+               return NULL;
+       }
+
+       return (void *)(uintptr_t)mem->map[m]->buf[n].addr + offset;
+}
+
+/* copy data from a range (vaddr, vaddr+length-1) to or from
+ * a mem object starting at iova. Compute incremental value of
+ * crc32 if crcp is not zero. caller must hold a reference to mem
+ *
+ * dir == to_mem_obj copies from addr into the mem object; any other
+ * value copies from the mem object to addr. The crc is always taken
+ * over the source bytes of the copy.
+ *
+ * For a DMA type object iova is used directly as the memory address
+ * and no range check is made. For other types the range is checked
+ * with mem_check_range() and the copy proceeds buffer by buffer
+ * through the map table.
+ *
+ * Returns 0 on success or -EFAULT if the range check fails.
+ */
+int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length,
+                enum copy_direction dir, u32 *crcp)
+{
+       int                     err;
+       int                     bytes;
+       u8                      *va;
+       struct rxe_map          **map;
+       struct rxe_phys_buf     *buf;
+       int                     m;
+       int                     i;
+       size_t                  offset;
+       u32                     crc = crcp ? (*crcp) : 0;
+
+       if (mem->type == RXE_MEM_TYPE_DMA) {
+               u8 *src, *dest;
+
+               src  = (dir == to_mem_obj) ?
+                       addr : ((void *)(uintptr_t)iova);
+
+               dest = (dir == to_mem_obj) ?
+                       ((void *)(uintptr_t)iova) : addr;
+
+               if (crcp)
+                       *crcp = crc32_le(*crcp, src, length);
+
+               memcpy(dest, src, length);
+
+               return 0;
+       }
+
+       WARN_ON(!mem->map);
+
+       err = mem_check_range(mem, iova, length);
+       if (err) {
+               err = -EFAULT;
+               goto err1;
+       }
+
+       lookup_iova(mem, iova, &m, &i, &offset); /* map m, buffer i, byte offset */
+
+       map     = mem->map + m;
+       buf     = map[0]->buf + i;
+
+       while (length > 0) {
+               u8 *src, *dest;
+
+               va      = (u8 *)(uintptr_t)buf->addr + offset;
+               src  = (dir == to_mem_obj) ? addr : va;
+               dest = (dir == to_mem_obj) ? va : addr;
+
+               bytes   = buf->size - offset; /* what is left of this buffer */
+
+               if (bytes > length)
+                       bytes = length;
+
+               if (crcp)
+                       crc = crc32_le(crc, src, bytes);
+
+               memcpy(dest, src, bytes);
+
+               length  -= bytes;
+               addr    += bytes;
+
+               offset  = 0; /* every buffer after the first is used from its start */
+               buf++;
+               i++;
+
+               if (i == RXE_BUF_PER_MAP) { /* end of this map, go to the next one */
+                       i = 0;
+                       map++;
+                       buf = map[0]->buf;
+               }
+       }
+
+       if (crcp)
+               *crcp = crc; /* the caller's crc is only updated on success */
+
+       return 0;
+
+err1:
+       return err;
+}
+
+/* copy data in or out of a wqe, i.e. sg list
+ * under the control of a dma descriptor
+ *
+ * Copies length bytes between addr and the sg list in dma, starting
+ * at entry dma->cur_sge, byte dma->sge_offset. The mem object of each
+ * non-empty sg entry is found with lookup_mem() using the entry's lkey
+ * and the given pd and access; the copy itself (and the optional crc
+ * update through crcp) is done by rxe_mem_copy().
+ *
+ * Returns 0 on success (also when length is 0), -EINVAL if length
+ * exceeds dma->resid or a mem lookup fails, -ENOSPC if the sg list is
+ * exhausted first, or the error returned by rxe_mem_copy().
+ *
+ * dma->sge_offset and dma->resid are written back only on success;
+ * dma->cur_sge is advanced while walking, including on paths that
+ * end in an error.
+ */
+int copy_data(
+       struct rxe_dev          *rxe,
+       struct rxe_pd           *pd,
+       int                     access,
+       struct rxe_dma_info     *dma,
+       void                    *addr,
+       int                     length,
+       enum copy_direction     dir,
+       u32                     *crcp)
+{
+       int                     bytes;
+       struct rxe_sge          *sge    = &dma->sge[dma->cur_sge];
+       int                     offset  = dma->sge_offset;
+       int                     resid   = dma->resid;
+       struct rxe_mem          *mem    = NULL;
+       u64                     iova;
+       int                     err;
+
+       if (length == 0)
+               return 0;
+
+       if (length > resid) {
+               err = -EINVAL;
+               goto err2;
+       }
+
+       if (sge->length && (offset < sge->length)) { /* current entry still has room */
+               mem = lookup_mem(pd, access, sge->lkey, lookup_local);
+               if (!mem) {
+                       err = -EINVAL;
+                       goto err1;
+               }
+       }
+
+       while (length > 0) {
+               bytes = length;
+
+               if (offset >= sge->length) { /* entry used up, move to the next one */
+                       if (mem) {
+                               rxe_drop_ref(mem);
+                               mem = NULL;
+                       }
+                       sge++;
+                       dma->cur_sge++;
+                       offset = 0;
+
+                       if (dma->cur_sge >= dma->num_sge) {
+                               err = -ENOSPC;
+                               goto err2;
+                       }
+
+                       if (sge->length) {
+                               mem = lookup_mem(pd, access, sge->lkey,
+                                                lookup_local);
+                               if (!mem) {
+                                       err = -EINVAL;
+                                       goto err1;
+                               }
+                       } else {
+                               continue; /* zero-length entry: skip it */
+                       }
+               }
+
+               if (bytes > sge->length - offset)
+                       bytes = sge->length - offset;
+
+               if (bytes > 0) {
+                       iova = sge->addr + offset;
+
+                       err = rxe_mem_copy(mem, iova, addr, bytes, dir, crcp);
+                       if (err)
+                               goto err2;
+
+                       offset  += bytes;
+                       resid   -= bytes;
+                       length  -= bytes;
+                       addr    += bytes;
+               }
+       }
+
+       dma->sge_offset = offset;
+       dma->resid      = resid;
+
+       if (mem)
+               rxe_drop_ref(mem);
+
+       return 0;
+
+err2:
+       if (mem)
+               rxe_drop_ref(mem);
+err1:
+       return err;
+}
+
+/*
+ * Move the position in dma's sg list forward by length bytes without
+ * copying anything.
+ *
+ * Returns 0 on success, in which case dma->sge_offset and dma->resid are
+ * updated, or -ENOSPC if the sg list ends before length bytes have been
+ * skipped.  dma->cur_sge is advanced while walking in either case.
+ */
+int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
+{
+       struct rxe_sge          *cur    = &dma->sge[dma->cur_sge];
+       int                     pos     = dma->sge_offset;
+       int                     left    = dma->resid;
+       unsigned int            chunk;
+
+       for (; length; length -= chunk) {
+               if (pos >= cur->length) {
+                       /* current entry is used up, step to the next one */
+                       cur++;
+                       dma->cur_sge++;
+                       pos = 0;
+                       if (dma->cur_sge >= dma->num_sge)
+                               return -ENOSPC;
+               }
+
+               /* take as much as this entry still offers */
+               chunk = length;
+               if (chunk > cur->length - pos)
+                       chunk = cur->length - pos;
+
+               pos     += chunk;
+               left    -= chunk;
+       }
+
+       dma->sge_offset = pos;
+       dma->resid      = left;
+
+       return 0;
+}
+
+/* Find the mem object named by key and validate it for use with pd.
+ *
+ * The pool index is taken from the key (key >> 8).  The object is
+ * accepted only if
+ *   - its lkey (lookup_local) or rkey (lookup_remote) equals key,
+ *   - it belongs to pd,
+ *   - it grants at least one of the requested access bits, when any
+ *     are requested, and
+ *   - it is in the valid state.
+ *
+ * Returns the object on success; the reference obtained through
+ * rxe_pool_get_index() is then left for the caller to drop.  Returns
+ * NULL otherwise, with that reference already dropped.
+ */
+struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key,
+                          enum lookup_type type)
+{
+       struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
+       int index = key >> 8;
+       struct rxe_mem *mem;
+
+       if (index < RXE_MIN_MR_INDEX || index > RXE_MAX_MR_INDEX)
+               return NULL;
+
+       mem = rxe_pool_get_index(&rxe->mr_pool, index);
+       if (!mem)
+               return NULL;
+
+       if (type == lookup_local && mem->lkey != key)
+               goto reject;
+
+       if (type == lookup_remote && mem->rkey != key)
+               goto reject;
+
+       if (mem->pd != pd)
+               goto reject;
+
+       if (access && !(access & mem->access))
+               goto reject;
+
+       if (mem->state != RXE_MEM_STATE_VALID)
+               goto reject;
+
+       return mem;
+
+reject:
+       rxe_drop_ref(mem);
+       return NULL;
+}
+
+/*
+ * Load num_pages page addresses into mem's map table and mark the
+ * object valid, starting at iova.  Every buffer is given the size
+ * 1 << mem->page_shift.
+ *
+ * Returns 0 on success or -EINVAL if num_pages exceeds mem->max_buf.
+ */
+int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem,
+                     u64 *page, int num_pages, u64 iova)
+{
+       struct rxe_map **map;
+       struct rxe_phys_buf *buf;
+       int page_size;
+       int slot;
+       int i;
+
+       if (num_pages > mem->max_buf)
+               return -EINVAL;
+
+       page_size = 1 << mem->page_shift;
+       map = mem->map;
+       buf = map[0]->buf;
+       slot = 0;
+
+       for (i = 0; i < num_pages; i++) {
+               buf->addr = page[i];
+               buf->size = page_size;
+               buf++;
+               slot++;
+
+               /* this map is full, continue in the next one */
+               if (slot == RXE_BUF_PER_MAP) {
+                       map++;
+                       buf = map[0]->buf;
+                       slot = 0;
+               }
+       }
+
+       mem->state      = RXE_MEM_STATE_VALID;
+       mem->length     = num_pages << mem->page_shift;
+       mem->va         = iova;
+       mem->iova       = iova;
+
+       return 0;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c

new file mode 100644 (file)

index 0000000..0b8d2ea
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -0,0 +1,708 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/if_vlan.h>
+#include <net/udp_tunnel.h>
+#include <net/sch_generic.h>
+#include <linux/netfilter.h>
+#include <rdma/ib_addr.h>
+
+#include "rxe.h"
+#include "rxe_net.h"
+#include "rxe_loc.h"
+
+static LIST_HEAD(rxe_dev_list); /* rxe devices, linked through rxe_dev.list */
+static spinlock_t dev_list_lock; /* spinlock for device list */
+
+/*
+ * Find the rxe device on rxe_dev_list whose ndev is the given net
+ * device.  The list is searched under dev_list_lock.
+ * Returns the device, or NULL if there is none.
+ */
+struct rxe_dev *net_to_rxe(struct net_device *ndev)
+{
+       struct rxe_dev *cur;
+       struct rxe_dev *match = NULL;
+
+       spin_lock_bh(&dev_list_lock);
+       list_for_each_entry(cur, &rxe_dev_list, list) {
+               if (cur->ndev != ndev)
+                       continue;
+
+               match = cur;
+               break;
+       }
+       spin_unlock_bh(&dev_list_lock);
+
+       return match;
+}
+
+/*
+ * Find the rxe device on rxe_dev_list whose ib_dev.name equals name.
+ * The list is searched under dev_list_lock.
+ * Returns the device, or NULL if there is none.
+ */
+struct rxe_dev *get_rxe_by_name(const char *name)
+{
+       struct rxe_dev *cur;
+       struct rxe_dev *match = NULL;
+
+       spin_lock_bh(&dev_list_lock);
+       list_for_each_entry(cur, &rxe_dev_list, list) {
+               if (strcmp(name, cur->ib_dev.name) != 0)
+                       continue;
+
+               match = cur;
+               break;
+       }
+       spin_unlock_bh(&dev_list_lock);
+
+       return match;
+}
+
+
+struct rxe_recv_sockets recv_sockets; /* sk6 is used by rxe_find_route6() for IPv6 route lookups */
+
+/*
+ * Build an 8-byte identifier from the 6-byte hardware address of ndev:
+ * the first three address bytes (with bit 1 of the first byte inverted),
+ * then 0xff 0xfe, then the last three address bytes.
+ */
+static __be64 rxe_mac_to_eui64(struct net_device *ndev)
+{
+       unsigned char *mac = ndev->dev_addr;
+       __be64 eui64;
+       unsigned char *out = (unsigned char *)&eui64;
+
+       /* upper half of the address, with the inverted bit */
+       memcpy(out, mac, 3);
+       out[0] ^= 2;
+
+       /* fixed filler between the two halves */
+       out[3] = 0xff;
+       out[4] = 0xfe;
+
+       /* lower half of the address */
+       memcpy(out + 5, mac + 3, 3);
+
+       return eui64;
+}
+
+/* Node GUID of rxe, derived from the hardware address of its net device. */
+static __be64 node_guid(struct rxe_dev *rxe)
+{
+       struct net_device *ndev = rxe->ndev;
+
+       return rxe_mac_to_eui64(ndev);
+}
+
+/* Port GUID of rxe, derived from the hardware address of its net device. */
+static __be64 port_guid(struct rxe_dev *rxe)
+{
+       struct net_device *ndev = rxe->ndev;
+
+       return rxe_mac_to_eui64(ndev);
+}
+
+/*
+ * Return the parent device of rxe's net device.  For a net device
+ * flagged IFF_802_1Q_VLAN the parent of the underlying real device is
+ * returned instead.
+ */
+static struct device *dma_device(struct rxe_dev *rxe)
+{
+       struct net_device *ndev = rxe->ndev;
+
+       if (ndev->priv_flags & IFF_802_1Q_VLAN)
+               ndev = vlan_dev_real_dev(ndev);
+
+       return ndev->dev.parent;
+}
+
+/*
+ * Map the multicast GID to a link-layer multicast address and add that
+ * address to rxe's net device.  Returns the result of dev_mc_add().
+ */
+static int mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
+{
+       unsigned char ll_addr[ETH_ALEN];
+
+       ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+
+       return dev_mc_add(rxe->ndev, ll_addr);
+}
+
+/*
+ * Map the multicast GID to a link-layer multicast address and remove
+ * that address from rxe's net device.  Returns the result of
+ * dev_mc_del().
+ */
+static int mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid)
+{
+       unsigned char ll_addr[ETH_ALEN];
+
+       ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+
+       return dev_mc_del(rxe->ndev, ll_addr);
+}
+
+/*
+ * Look up the IPv4 route for UDP traffic from saddr to daddr leaving
+ * through ndev.  Returns the route's dst entry, or NULL (after a
+ * rate-limited error message) if the lookup fails.
+ */
+static struct dst_entry *rxe_find_route4(struct net_device *ndev,
+                                 struct in_addr *saddr,
+                                 struct in_addr *daddr)
+{
+       struct flowi4 fl;
+       struct rtable *rt;
+
+       /* describe the flow: everything zero except the fields below */
+       memset(&fl, 0, sizeof(fl));
+       fl.flowi4_proto = IPPROTO_UDP;
+       fl.flowi4_oif = ndev->ifindex;
+       memcpy(&fl.saddr, saddr, sizeof(*saddr));
+       memcpy(&fl.daddr, daddr, sizeof(*daddr));
+
+       rt = ip_route_output_key(&init_net, &fl);
+       if (!IS_ERR(rt))
+               return &rt->dst;
+
+       pr_err_ratelimited("no route to %pI4\n", &daddr->s_addr);
+       return NULL;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/*
+ * Look up the IPv6 route for UDP traffic from saddr to daddr leaving
+ * through ndev, using the IPv6 receive socket in recv_sockets.
+ *
+ * Returns the dst entry on success.  Returns NULL if the lookup fails
+ * or the entry it produced carries an error; in that case whatever
+ * entry was obtained is released here.
+ */
+static struct dst_entry *rxe_find_route6(struct net_device *ndev,
+                                        struct in6_addr *saddr,
+                                        struct in6_addr *daddr)
+{
+       /*
+        * Must start out NULL: the failure path below hands ndst to
+        * dst_release(), and a lookup callback that fails without
+        * writing to ndst would otherwise leave stack garbage in it.
+        */
+       struct dst_entry *ndst = NULL;
+       struct flowi6 fl6 = { { 0 } };
+
+       memset(&fl6, 0, sizeof(fl6));
+       fl6.flowi6_oif = ndev->ifindex;
+       memcpy(&fl6.saddr, saddr, sizeof(*saddr));
+       memcpy(&fl6.daddr, daddr, sizeof(*daddr));
+       fl6.flowi6_proto = IPPROTO_UDP;
+
+       if (unlikely(ipv6_stub->ipv6_dst_lookup(sock_net(recv_sockets.sk6->sk),
+                                               recv_sockets.sk6->sk, &ndst, &fl6))) {
+               pr_err_ratelimited("no route to %pI6\n", daddr);
+               goto put;
+       }
+
+       if (unlikely(ndst->error)) {
+               pr_err("no route to %pI6\n", daddr);
+               goto put;
+       }
+
+       return ndst;
+put:
+       dst_release(ndst);
+       return NULL;
+}
+
+#else
+
+/* Without IPv6 support there is never an IPv6 route. */
+static struct dst_entry *rxe_find_route6(struct net_device *ndev,
+                                        struct in6_addr *saddr,
+                                        struct in6_addr *daddr)
+{
+       return NULL;
+}
+
+#endif
+
+/*
+ * Receive callback installed on the UDP tunnel sockets (see
+ * rxe_setup_udp_tunnel).  Fills in the packet info stored with the skb
+ * and passes the skb to rxe_rcv().
+ *
+ * The skb is freed and 0 is returned when no rxe device is bound to the
+ * receiving net device or when the skb cannot be linearized.
+ */
+static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+       struct udphdr *udph;
+       struct rxe_dev *rxe = net_to_rxe(skb->dev);
+       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+
+       if (!rxe)
+               goto drop;
+
+       if (skb_linearize(skb)) {
+               pr_err("skb_linearize failed\n");
+               goto drop;
+       }
+
+       /* read the header only after linearizing */
+       udph = udp_hdr(skb);
+
+       pkt->rxe = rxe;
+       pkt->port_num = 1;
+       pkt->mask = RXE_GRH_MASK;
+       /* the rxe headers start right behind the UDP header */
+       pkt->hdr = (u8 *)(udph + 1);
+       pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph);
+
+       return rxe_rcv(skb);
+
+drop:
+       kfree_skb(skb);
+       return 0;
+}
+
+/*
+ * Create a UDP socket bound to port in net (IPv6-only when ipv6 is set,
+ * IPv4 otherwise) and turn it into a tunnel socket whose packets are
+ * delivered to rxe_udp_encap_recv().
+ *
+ * Returns the socket, or an ERR_PTR() carrying the error from
+ * udp_sock_create().
+ */
+static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port,
+                                          bool ipv6)
+{
+       int err;
+       struct socket *sock;
+       struct udp_port_cfg udp_cfg;
+       struct udp_tunnel_sock_cfg tnl_cfg;
+
+       memset(&udp_cfg, 0, sizeof(udp_cfg));
+
+       if (ipv6) {
+               udp_cfg.family = AF_INET6;
+               udp_cfg.ipv6_v6only = 1;
+       } else {
+               udp_cfg.family = AF_INET;
+       }
+
+       udp_cfg.local_udp_port = port;
+
+       /* Create UDP socket */
+       err = udp_sock_create(net, &udp_cfg, &sock);
+       if (err < 0) {
+               pr_err("failed to create udp socket. err = %d\n", err);
+               return ERR_PTR(err);
+       }
+
+       /*
+        * Zero the whole configuration first: only some members are set
+        * below, and any member left untouched would otherwise reach
+        * setup_udp_tunnel_sock() holding stack garbage.
+        */
+       memset(&tnl_cfg, 0, sizeof(tnl_cfg));
+
+       tnl_cfg.sk_user_data = NULL;
+       tnl_cfg.encap_type = 1;
+       tnl_cfg.encap_rcv = rxe_udp_encap_recv;
+       tnl_cfg.encap_destroy = NULL;
+
+       /* Setup UDP tunnel */
+       setup_udp_tunnel_sock(net, sock, &tnl_cfg);
+
+       return sock;
+}
+
+/* Release a tunnel socket (passed in as sk) via udp_tunnel_sock_release(). */
+static void rxe_release_udp_tunnel(struct socket *sk)
+{
+       struct socket *sock = sk;
+
+       udp_tunnel_sock_release(sock);
+}
+
+/*
+ * Push a UDP header in front of the current skb data and fill it in.
+ * The length field covers the header plus everything already in the
+ * skb; the checksum field is set to 0.
+ */
+static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port,
+                           __be16 dst_port)
+{
+       struct udphdr *hdr;
+
+       __skb_push(skb, sizeof(*hdr));
+       skb_reset_transport_header(skb);
+
+       hdr = udp_hdr(skb);
+       hdr->source = src_port;
+       hdr->dest = dst_port;
+       hdr->check = 0;
+       /* skb->len already includes the header pushed above */
+       hdr->len = htons(skb->len);
+}
+
+/*
+ * Attach dst to the skb, push an IPv4 header without options in front
+ * of the current skb data and fill it in, including the packet id, the
+ * total length and the header checksum.
+ */
+static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb,
+                            __be32 saddr, __be32 daddr, __u8 proto,
+                            __u8 tos, __u8 ttl, __be16 df, bool xnet)
+{
+       struct iphdr *iph;
+
+       /* reset per-packet state before the skb is sent out */
+       skb_scrub_packet(skb, xnet);
+       skb_clear_hash(skb);
+       skb_dst_set(skb, dst);
+       memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+       skb_push(skb, sizeof(*iph));
+       skb_reset_network_header(skb);
+       iph = ip_hdr(skb);
+
+       iph->version = IPVERSION;
+       iph->ihl = sizeof(*iph) >> 2;
+       iph->tos = tos;
+       iph->frag_off = df;
+       iph->ttl = ttl;
+       iph->protocol = proto;
+       iph->saddr = saddr;
+       iph->daddr = daddr;
+
+       __ip_select_ident(dev_net(dst->dev), iph,
+                         skb_shinfo(skb)->gso_segs ?: 1);
+
+       /* length first: the checksum covers the finished header */
+       iph->tot_len = htons(skb->len);
+       ip_send_check(iph);
+}
+
+/*
+ * Attach dst to the skb, push an IPv6 header in front of the current
+ * skb data and fill it in.  The payload length is the skb length minus
+ * the IPv6 header itself.
+ */
+static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb,
+                            struct in6_addr *saddr, struct in6_addr *daddr,
+                            __u8 proto, __u8 prio, __u8 ttl)
+{
+       struct ipv6hdr *ip6h;
+
+       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+       IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED
+                           | IPSKB_REROUTED);
+       skb_dst_set(skb, dst);
+
+       __skb_push(skb, sizeof(*ip6h));
+       skb_reset_network_header(skb);
+       ip6h              = ipv6_hdr(skb);
+       ip6_flow_hdr(ip6h, prio, htonl(0));
+       ip6h->nexthdr     = proto;
+       ip6h->hop_limit   = ttl;
+       ip6h->daddr       = *daddr;
+       ip6h->saddr       = *saddr;
+       /* skb->len includes the header just pushed; the payload does not */
+       ip6h->payload_len = htons(skb->len - sizeof(*ip6h));
+}
+
+/*
+ * Add UDP and IPv4 headers to skb for the addresses in av.
+ *
+ * The packet is flagged with RXE_LOOPBACK_MASK when source and
+ * destination address are the same.  Returns 0 on success or
+ * -EHOSTUNREACH if no route to the destination is found.
+ */
+static int prepare4(struct rxe_dev *rxe, struct sk_buff *skb, struct rxe_av *av)
+{
+       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+       struct in_addr *src = &av->sgid_addr._sockaddr_in.sin_addr;
+       struct in_addr *dest = &av->dgid_addr._sockaddr_in.sin_addr;
+       struct dst_entry *route;
+
+       route = rxe_find_route4(rxe->ndev, src, dest);
+       if (!route) {
+               pr_err("Host not reachable\n");
+               return -EHOSTUNREACH;
+       }
+
+       if (memcmp(src, dest, sizeof(*dest)) == 0)
+               pkt->mask |= RXE_LOOPBACK_MASK;
+
+       prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT),
+                       htons(ROCE_V2_UDP_DPORT));
+
+       /* "don't fragment" is always set; xnet is always false */
+       prepare_ipv4_hdr(route, skb, src->s_addr, dest->s_addr, IPPROTO_UDP,
+                        av->grh.traffic_class, av->grh.hop_limit,
+                        htons(IP_DF), false);
+       return 0;
+}
+
+/*
+ * Add UDP and IPv6 headers to skb for the addresses in av.
+ *
+ * The packet is flagged with RXE_LOOPBACK_MASK when source and
+ * destination address are the same.  Returns 0 on success or
+ * -EHOSTUNREACH if no route to the destination is found.
+ */
+static int prepare6(struct rxe_dev *rxe, struct sk_buff *skb, struct rxe_av *av)
+{
+       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+       struct in6_addr *src = &av->sgid_addr._sockaddr_in6.sin6_addr;
+       struct in6_addr *dest = &av->dgid_addr._sockaddr_in6.sin6_addr;
+       struct dst_entry *route;
+
+       route = rxe_find_route6(rxe->ndev, src, dest);
+       if (!route) {
+               pr_err("Host not reachable\n");
+               return -EHOSTUNREACH;
+       }
+
+       if (memcmp(src, dest, sizeof(*dest)) == 0)
+               pkt->mask |= RXE_LOOPBACK_MASK;
+
+       prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT),
+                       htons(ROCE_V2_UDP_DPORT));
+
+       prepare_ipv6_hdr(route, skb, src, dest, IPPROTO_UDP,
+                        av->grh.traffic_class,
+                        av->grh.hop_limit);
+       return 0;
+}
+
+/*
+ * Add the network headers matching the packet's address vector and
+ * store the header crc computed by rxe_icrc_hdr() in *crc.
+ *
+ * Returns the result of prepare4()/prepare6(), or 0 without adding any
+ * header when the network type is neither IPv4 nor IPv6.  *crc is
+ * written in every case, also when header preparation failed.
+ */
+static int prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+                  struct sk_buff *skb, u32 *crc)
+{
+       struct rxe_av *av = rxe_get_av(pkt);
+       int err;
+
+       if (av->network_type == RDMA_NETWORK_IPV4)
+               err = prepare4(rxe, skb, av);
+       else if (av->network_type == RDMA_NETWORK_IPV6)
+               err = prepare6(rxe, skb, av);
+       else
+               err = 0;
+
+       *crc = rxe_icrc_hdr(pkt, skb);
+
+       return err;
+}
+
+/*
+ * skb destructor installed by send().  Decrements the count of
+ * outstanding skbs of the owning QP and runs the QP's requester task
+ * when the QP has need_req_skb set and the count has dropped below
+ * RXE_INFLIGHT_SKBS_PER_QP_LOW.
+ */
+static void rxe_skb_tx_dtor(struct sk_buff *skb)
+{
+       struct rxe_qp *qp = skb->sk->sk_user_data;
+       int in_flight;
+
+       in_flight = atomic_dec_return(&qp->skb_out);
+
+       if (unlikely(qp->need_req_skb &&
+                    in_flight < RXE_INFLIGHT_SKBS_PER_QP_LOW))
+               rxe_run_task(&qp->req.task, 1);
+}
+
+/*
+ * Transmit a clone of @skb through the IPv4 or IPv6 output path.
+ *
+ * The clone gets rxe_skb_tx_dtor() as destructor and the QP's socket as
+ * owner.  The original @skb is freed only after a successful hand-off.
+ *
+ * Returns 0 on success, -ENOMEM if cloning fails, -EINVAL for an unknown
+ * network type and -EAGAIN when the output path reports an error.
+ */
+static int send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+               struct sk_buff *skb)
+{
+       struct rxe_av *av = rxe_get_av(pkt);
+       struct sk_buff *clone;
+       int ret;
+
+       clone = skb_clone(skb, GFP_ATOMIC);
+       if (!clone)
+               return -ENOMEM;
+
+       clone->destructor = rxe_skb_tx_dtor;
+       clone->sk = pkt->qp->sk->sk;
+
+       switch (av->network_type) {
+       case RDMA_NETWORK_IPV4:
+               ret = ip_local_out(dev_net(skb_dst(skb)->dev),
+                                  clone->sk, clone);
+               break;
+       case RDMA_NETWORK_IPV6:
+               ret = ip6_local_out(dev_net(skb_dst(skb)->dev),
+                                   clone->sk, clone);
+               break;
+       default:
+               pr_err("Unknown layer 3 protocol: %d\n", av->network_type);
+               kfree_skb(clone);
+               return -EINVAL;
+       }
+
+       if (unlikely(net_xmit_eval(ret))) {
+               pr_debug("error sending packet: %d\n", ret);
+               return -EAGAIN;
+       }
+
+       /* the clone is on its way; drop the original */
+       kfree_skb(skb);
+
+       return 0;
+}
+
+/*
+ * Loopback hook of ifc_ops: hands @skb straight to rxe_rcv() and returns
+ * whatever rxe_rcv() returns.
+ */
+static int loopback(struct sk_buff *skb)
+{
+       return rxe_rcv(skb);
+}
+
+/*
+ * Returns non-zero when the port GUID of @rxe equals the interface id
+ * of the destination GID stored in @av, zero otherwise.
+ */
+static inline int addr_same(struct rxe_dev *rxe, struct rxe_av *av)
+{
+       return rxe->port.port_guid == av->grh.dgid.global.interface_id;
+}
+
+/*
+ * Allocate an skb for an outgoing packet with @paylen bytes of zeroed
+ * payload and enough headroom for the link-layer, Ethernet, IP (v4 or v6,
+ * chosen by av->network_type) and UDP headers.
+ *
+ * On success the packet info @pkt is pointed at the payload area and
+ * flagged with RXE_GRH_MASK.  Returns the skb, or NULL if allocation fails.
+ */
+static struct sk_buff *init_packet(struct rxe_dev *rxe, struct rxe_av *av,
+                                  int paylen, struct rxe_pkt_info *pkt)
+{
+       const bool ipv4 = (av->network_type == RDMA_NETWORK_IPV4);
+       unsigned int hdr_len = ETH_HLEN + sizeof(struct udphdr);
+       struct sk_buff *skb;
+
+       hdr_len += ipv4 ? sizeof(struct iphdr) : sizeof(struct ipv6hdr);
+
+       skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(rxe->ndev),
+                       GFP_ATOMIC);
+       if (unlikely(!skb))
+               return NULL;
+
+       /* leave room in front of the payload for all lower-layer headers */
+       skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(rxe->ndev));
+
+       skb->dev = rxe->ndev;
+       skb->protocol = ipv4 ? htons(ETH_P_IP) : htons(ETH_P_IPV6);
+
+       pkt->rxe = rxe;
+       pkt->port_num = 1;
+       pkt->hdr = skb_put(skb, paylen);
+       pkt->mask |= RXE_GRH_MASK;
+
+       memset(pkt->hdr, 0, paylen);
+
+       return skb;
+}
+
+/*
+ * this is required by rxe_cfg to match rxe devices in
+ * /sys/class/infiniband up with their underlying ethernet devices
+ *
+ * Returns the name of the net_device backing @rxe; @port_num is unused.
+ */
+static char *parent_name(struct rxe_dev *rxe, unsigned int port_num)
+{
+       return rxe->ndev->name;
+}
+
+/*
+ * Link-layer hook of ifc_ops: always reports Ethernet.  Neither @rxe nor
+ * @port_num is consulted.
+ */
+static enum rdma_link_layer link_layer(struct rxe_dev *rxe,
+                                      unsigned int port_num)
+{
+       return IB_LINK_LAYER_ETHERNET;
+}
+
+/*
+ * Interface callback table for this network back end.  rxe_net_add()
+ * stores a pointer to it in rxe->ifc_ops for every device it creates.
+ */
+static struct rxe_ifc_ops ifc_ops = {
+       .node_guid      = node_guid,
+       .port_guid      = port_guid,
+       .dma_device     = dma_device,
+       .mcast_add      = mcast_add,
+       .mcast_delete   = mcast_delete,
+       .prepare        = prepare,
+       .send           = send,
+       .loopback       = loopback,
+       .init_packet    = init_packet,
+       .parent_name    = parent_name,
+       .link_layer     = link_layer,
+};
+
+/*
+ * Create an rxe device on top of the net_device @ndev, register it via
+ * rxe_add() using the current MTU of @ndev, and append it to rxe_dev_list.
+ *
+ * Returns the new device, or NULL if allocation or rxe_add() fails (the
+ * partially set-up device is released in the latter case).
+ */
+struct rxe_dev *rxe_net_add(struct net_device *ndev)
+{
+       int err;
+       struct rxe_dev *rxe;
+
+       rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
+       if (!rxe)
+               return NULL;
+
+       rxe->ifc_ops = &ifc_ops;
+       rxe->ndev = ndev;
+
+       err = rxe_add(rxe, ndev->mtu);
+       if (err) {
+               ib_dealloc_device(&rxe->ib_dev);
+               return NULL;
+       }
+
+       spin_lock_bh(&dev_list_lock);
+       /* list_add_tail(new, head): the entry being added comes first */
+       list_add_tail(&rxe->list, &rxe_dev_list);
+       spin_unlock_bh(&dev_list_lock);
+       return rxe;
+}
+
+/*
+ * Remove every device on rxe_dev_list.
+ *
+ * Each device is unlinked while dev_list_lock is held, but the lock is
+ * dropped around the call to rxe_remove().
+ */
+void rxe_remove_all(void)
+{
+       struct rxe_dev *rxe;
+
+       for (;;) {
+               spin_lock_bh(&dev_list_lock);
+               if (list_empty(&rxe_dev_list)) {
+                       spin_unlock_bh(&dev_list_lock);
+                       break;
+               }
+
+               rxe = list_first_entry(&rxe_dev_list, struct rxe_dev, list);
+               list_del(&rxe->list);
+               spin_unlock_bh(&dev_list_lock);
+
+               rxe_remove(rxe);
+       }
+}
+EXPORT_SYMBOL(rxe_remove_all);
+
+/*
+ * Dispatch the IB event @event for port 1 of @rxe's ib_device.
+ */
+static void rxe_port_event(struct rxe_dev *rxe,
+                          enum ib_event_type event)
+{
+       struct ib_event port_ev;
+
+       port_ev.event = event;
+       port_ev.device = &rxe->ib_dev;
+       port_ev.element.port_num = 1;
+
+       ib_dispatch_event(&port_ev);
+}
+
+/*
+ * Mark the port of @rxe as active with its physical link up, dispatch
+ * IB_EVENT_PORT_ACTIVE and log the transition.
+ *
+ * Caller must hold net_info_lock
+ */
+void rxe_port_up(struct rxe_dev *rxe)
+{
+       rxe->port.attr.state = IB_PORT_ACTIVE;
+       rxe->port.attr.phys_state = IB_PHYS_STATE_LINK_UP;
+
+       rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE);
+       pr_info("rxe: set %s active\n", rxe->ib_dev.name);
+}
+
+/*
+ * Mark the port of @rxe as down with its physical link down, dispatch
+ * IB_EVENT_PORT_ERR and log the transition.
+ *
+ * Caller must hold net_info_lock
+ */
+void rxe_port_down(struct rxe_dev *rxe)
+{
+       rxe->port.attr.state = IB_PORT_DOWN;
+       rxe->port.attr.phys_state = IB_PHYS_STATE_LINK_DOWN;
+
+       rxe_port_event(rxe, IB_EVENT_PORT_ERR);
+       pr_info("rxe: set %s down\n", rxe->ib_dev.name);
+}
+
+/*
+ * Netdevice notifier callback.
+ *
+ * Events for net_devices that have no rxe device attached (net_to_rxe()
+ * returns NULL) are ignored.  Otherwise:
+ *   NETDEV_UNREGISTER - unlink the rxe device from rxe_dev_list and
+ *                       remove it
+ *   NETDEV_UP/DOWN    - update the port state
+ *   NETDEV_CHANGEMTU  - propagate the new MTU
+ * All other events are only logged.
+ *
+ * Always returns NOTIFY_OK.
+ */
+static int rxe_notify(struct notifier_block *not_blk,
+                     unsigned long event,
+                     void *arg)
+{
+       struct net_device *ndev = netdev_notifier_info_to_dev(arg);
+       struct rxe_dev *rxe = net_to_rxe(ndev);
+
+       if (!rxe)
+               goto out;
+
+       switch (event) {
+       case NETDEV_UNREGISTER:
+               /*
+                * rxe_dev_list is modified under dev_list_lock everywhere
+                * else (rxe_net_add(), rxe_remove_all()); take it here too
+                * and, as in rxe_remove_all(), drop it before rxe_remove().
+                */
+               spin_lock_bh(&dev_list_lock);
+               list_del(&rxe->list);
+               spin_unlock_bh(&dev_list_lock);
+               rxe_remove(rxe);
+               break;
+       case NETDEV_UP:
+               rxe_port_up(rxe);
+               break;
+       case NETDEV_DOWN:
+               rxe_port_down(rxe);
+               break;
+       case NETDEV_CHANGEMTU:
+               pr_info("rxe: %s changed mtu to %d\n", ndev->name, ndev->mtu);
+               rxe_set_mtu(rxe, ndev->mtu);
+               break;
+       case NETDEV_REBOOT:
+       case NETDEV_CHANGE:
+       case NETDEV_GOING_DOWN:
+       case NETDEV_CHANGEADDR:
+       case NETDEV_CHANGENAME:
+       case NETDEV_FEAT_CHANGE:
+       default:
+               /* event is unsigned long, so print it with %lu */
+               pr_info("rxe: ignoring netdev event = %lu for %s\n",
+                       event, ndev->name);
+               break;
+       }
+out:
+       return NOTIFY_OK;
+}
+
+/*
+ * Netdevice notifier that routes events to rxe_notify().  Registered in
+ * rxe_net_init() and unregistered in rxe_net_exit().
+ */
+static struct notifier_block rxe_net_notifier = {
+       .notifier_call = rxe_notify,
+};
+
+/*
+ * Set up the network side of rxe: initialise dev_list_lock, create the
+ * IPv6 and IPv4 UDP tunnel sockets on ROCE_V2_UDP_DPORT in init_net and
+ * register the netdevice notifier.
+ *
+ * Returns 0 on success or a negative errno.  On failure every socket that
+ * was already created is released and its pointer in recv_sockets is reset
+ * to NULL, so no stale pointer is left behind.
+ */
+int rxe_net_init(void)
+{
+       int err;
+
+       spin_lock_init(&dev_list_lock);
+
+       recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net,
+                       htons(ROCE_V2_UDP_DPORT), true);
+       if (IS_ERR(recv_sockets.sk6)) {
+               /* save the real error before the pointer is cleared */
+               err = PTR_ERR(recv_sockets.sk6);
+               recv_sockets.sk6 = NULL;
+               pr_err("rxe: Failed to create IPv6 UDP tunnel\n");
+               return err;
+       }
+
+       recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net,
+                       htons(ROCE_V2_UDP_DPORT), false);
+       if (IS_ERR(recv_sockets.sk4)) {
+               err = PTR_ERR(recv_sockets.sk4);
+               recv_sockets.sk4 = NULL;
+               pr_err("rxe: Failed to create IPv4 UDP tunnel\n");
+               goto err_release_sk6;
+       }
+
+       err = register_netdevice_notifier(&rxe_net_notifier);
+       if (err) {
+               pr_err("rxe: Failed to register netdev notifier\n");
+               goto err_release_sk4;
+       }
+
+       return 0;
+
+err_release_sk4:
+       rxe_release_udp_tunnel(recv_sockets.sk4);
+       recv_sockets.sk4 = NULL;
+err_release_sk6:
+       rxe_release_udp_tunnel(recv_sockets.sk6);
+       recv_sockets.sk6 = NULL;
+       return err;
+}
+
+/*
+ * Tear down what rxe_net_init() set up: release whichever UDP tunnel
+ * sockets exist, then unregister the netdevice notifier.
+ */
+void rxe_net_exit(void)
+{
+       struct socket *sock6 = recv_sockets.sk6;
+       struct socket *sock4 = recv_sockets.sk4;
+
+       if (sock6)
+               rxe_release_udp_tunnel(sock6);
+
+       if (sock4)
+               rxe_release_udp_tunnel(sock4);
+
+       unregister_netdevice_notifier(&rxe_net_notifier);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h

new file mode 100644 (file)

index 0000000..7b06f76
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_net.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_NET_H
+#define RXE_NET_H
+
+#include <net/sock.h>
+#include <net/if_inet6.h>
+#include <linux/module.h>
+
+/*
+ * Pair of sockets on which rxe receives traffic; sk4 and sk6 are
+ * presumably the IPv4 and IPv6 sockets (confirm against the code that
+ * fills in recv_sockets).
+ */
+struct rxe_recv_sockets {
+       struct socket *sk4;
+       struct socket *sk6;
+};
+
+/* Single shared instance of the receive sockets. */
+extern struct rxe_recv_sockets recv_sockets;
+
+/* Creates an rxe device for @ndev. */
+struct rxe_dev *rxe_net_add(struct net_device *ndev);
+
+/* Network-layer setup and teardown. */
+int rxe_net_init(void);
+void rxe_net_exit(void);
+
+#endif /* RXE_NET_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c

new file mode 100644 (file)

index 0000000..61927c1
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_opcode.c
@@ -0,0 +1,961 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_pack.h>
+#include "rxe_opcode.h"
+#include "rxe_hdr.h"
+
+/* useful information about work request opcodes and pkt opcodes in
+ * table form
+ *
+ * rxe_wr_opcode_info is indexed by IB work request opcode (IB_WR_*).
+ * Each entry carries the opcode's printable name and a mask array indexed
+ * by QP type (IB_QPT_*) holding the WR_*_MASK flags for that combination.
+ * QP types not listed in an entry are left zero-initialised.
+ */
+struct rxe_wr_opcode_info rxe_wr_opcode_info[] = {
+       [IB_WR_RDMA_WRITE]                              = {
+               .name   = "IB_WR_RDMA_WRITE",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_INLINE_MASK | WR_WRITE_MASK,
+                       [IB_QPT_UC]     = WR_INLINE_MASK | WR_WRITE_MASK,
+               },
+       },
+       [IB_WR_RDMA_WRITE_WITH_IMM]                     = {
+               .name   = "IB_WR_RDMA_WRITE_WITH_IMM",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_INLINE_MASK | WR_WRITE_MASK,
+                       [IB_QPT_UC]     = WR_INLINE_MASK | WR_WRITE_MASK,
+               },
+       },
+       [IB_WR_SEND]                                    = {
+               .name   = "IB_WR_SEND",
+               .mask   = {
+                       [IB_QPT_SMI]    = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_GSI]    = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_RC]     = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_UC]     = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_UD]     = WR_INLINE_MASK | WR_SEND_MASK,
+               },
+       },
+       [IB_WR_SEND_WITH_IMM]                           = {
+               .name   = "IB_WR_SEND_WITH_IMM",
+               .mask   = {
+                       [IB_QPT_SMI]    = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_GSI]    = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_RC]     = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_UC]     = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_UD]     = WR_INLINE_MASK | WR_SEND_MASK,
+               },
+       },
+       [IB_WR_RDMA_READ]                               = {
+               .name   = "IB_WR_RDMA_READ",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_READ_MASK,
+               },
+       },
+       [IB_WR_ATOMIC_CMP_AND_SWP]                      = {
+               .name   = "IB_WR_ATOMIC_CMP_AND_SWP",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_ATOMIC_MASK,
+               },
+       },
+       [IB_WR_ATOMIC_FETCH_AND_ADD]                    = {
+               .name   = "IB_WR_ATOMIC_FETCH_AND_ADD",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_ATOMIC_MASK,
+               },
+       },
+       [IB_WR_LSO]                                     = {
+               .name   = "IB_WR_LSO",
+               .mask   = {
+                       /* not supported */
+               },
+       },
+       [IB_WR_SEND_WITH_INV]                           = {
+               .name   = "IB_WR_SEND_WITH_INV",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_UC]     = WR_INLINE_MASK | WR_SEND_MASK,
+                       [IB_QPT_UD]     = WR_INLINE_MASK | WR_SEND_MASK,
+               },
+       },
+       [IB_WR_RDMA_READ_WITH_INV]                      = {
+               .name   = "IB_WR_RDMA_READ_WITH_INV",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_READ_MASK,
+               },
+       },
+       [IB_WR_LOCAL_INV]                               = {
+               .name   = "IB_WR_LOCAL_INV",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_REG_MASK,
+               },
+       },
+       [IB_WR_REG_MR]                                  = {
+               .name   = "IB_WR_REG_MR",
+               .mask   = {
+                       [IB_QPT_RC]     = WR_REG_MASK,
+               },
+       },
+};
+
+struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
+       [IB_OPCODE_RC_SEND_FIRST]                       = {
+               .name   = "IB_OPCODE_RC_SEND_FIRST",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK
+                               | RXE_SEND_MASK | RXE_START_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_MIDDLE]              = {
+               .name   = "IB_OPCODE_RC_SEND_MIDDLE]",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_LAST]                        = {
+               .name   = "IB_OPCODE_RC_SEND_LAST",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK
+                               | RXE_SEND_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE",
+               .mask   = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_ONLY]                        = {
+               .name   = "IB_OPCODE_RC_SEND_ONLY",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK
+                               | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_WRITE_FIRST]         = {
+               .name   = "IB_OPCODE_RC_RDMA_WRITE_FIRST",
+               .mask   = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_START_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]                = {
+               .name   = "IB_OPCODE_RC_RDMA_WRITE_MIDDLE",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_WRITE_LAST]                  = {
+               .name   = "IB_OPCODE_RC_RDMA_WRITE_LAST",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE]           = {
+               .name   = "IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+               .mask   = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY]                  = {
+               .name   = "IB_OPCODE_RC_RDMA_WRITE_ONLY",
+               .mask   = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_START_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE]           = {
+               .name   = "IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_READ_REQUEST]                        = {
+               .name   = "IB_OPCODE_RC_RDMA_READ_REQUEST",
+               .mask   = RXE_RETH_MASK | RXE_REQ_MASK | RXE_READ_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]         = {
+               .name   = "IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST",
+               .mask   = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+                               | RXE_START_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_AETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]                = {
+               .name   = "IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE",
+               .mask   = RXE_PAYLOAD_MASK | RXE_ACK_MASK | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]          = {
+               .name   = "IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST",
+               .mask   = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_AETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]          = {
+               .name   = "IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY",
+               .mask   = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_AETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_ACKNOWLEDGE]                      = {
+               .name   = "IB_OPCODE_RC_ACKNOWLEDGE",
+               .mask   = RXE_AETH_MASK | RXE_ACK_MASK | RXE_START_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_AETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]                       = {
+               .name   = "IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE",
+               .mask   = RXE_AETH_MASK | RXE_ATMACK_MASK | RXE_ACK_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_AETH]      = RXE_BTH_BYTES,
+                       [RXE_ATMACK]    = RXE_BTH_BYTES
+                                               + RXE_AETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                       + RXE_ATMACK_BYTES + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_COMPARE_SWAP]                     = {
+               .name   = "IB_OPCODE_RC_COMPARE_SWAP",
+               .mask   = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_ATMETH]    = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_ATMETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_FETCH_ADD]                        = {
+               .name   = "IB_OPCODE_RC_FETCH_ADD",
+               .mask   = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_ATMETH]    = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_ATMETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]                = {
+               .name   = "IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE",
+               .mask   = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]                = {
+               .name   = "IB_OPCODE_RC_SEND_ONLY_INV",
+               .mask   = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IETH_BYTES,
+               }
+       },
+
+       /* UC */
+       [IB_OPCODE_UC_SEND_FIRST]                       = {
+               .name   = "IB_OPCODE_UC_SEND_FIRST",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK
+                               | RXE_SEND_MASK | RXE_START_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_SEND_MIDDLE]              = {
+               .name   = "IB_OPCODE_UC_SEND_MIDDLE",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_SEND_LAST]                        = {
+               .name   = "IB_OPCODE_UC_SEND_LAST",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK
+                               | RXE_SEND_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE",
+               .mask   = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_SEND_ONLY]                        = {
+               .name   = "IB_OPCODE_UC_SEND_ONLY",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK
+                               | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_RDMA_WRITE_FIRST]         = {
+               .name   = "IB_OPCODE_UC_RDMA_WRITE_FIRST",
+               .mask   = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_START_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]                = {
+               .name   = "IB_OPCODE_UC_RDMA_WRITE_MIDDLE",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_RDMA_WRITE_LAST]                  = {
+               .name   = "IB_OPCODE_UC_RDMA_WRITE_LAST",
+               .mask   = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE]           = {
+               .name   = "IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+               .mask   = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY]                  = {
+               .name   = "IB_OPCODE_UC_RDMA_WRITE_ONLY",
+               .mask   = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_START_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE]           = {
+               .name   = "IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RETH]      = RXE_BTH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+
+       /* RD */
+       [IB_OPCODE_RD_SEND_FIRST]                       = {
+               .name   = "IB_OPCODE_RD_SEND_FIRST",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_SEND_MIDDLE]              = {
+               .name   = "IB_OPCODE_RD_SEND_MIDDLE",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_SEND_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_SEND_LAST]                        = {
+               .name   = "IB_OPCODE_RD_SEND_LAST",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_COMP_MASK | RXE_SEND_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK
+                               | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_SEND_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_SEND_ONLY]                        = {
+               .name   = "IB_OPCODE_RD_SEND_ONLY",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK
+                               | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_WRITE_FIRST]         = {
+               .name   = "IB_OPCODE_RD_RDMA_WRITE_FIRST",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK
+                               | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_START_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_RETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_WRITE_MIDDLE]                = {
+               .name   = "IB_OPCODE_RD_RDMA_WRITE_MIDDLE",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_WRITE_LAST]                  = {
+               .name   = "IB_OPCODE_RD_RDMA_WRITE_LAST",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE]           = {
+               .name   = "IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK
+                               | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_WRITE_ONLY]                  = {
+               .name   = "IB_OPCODE_RD_RDMA_WRITE_ONLY",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK
+                               | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_WRITE_MASK | RXE_START_MASK
+                               | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_RETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_RETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE]           = {
+               .name   = "IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK
+                               | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_WRITE_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES
+                               + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_RETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_RETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_RETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_READ_REQUEST]                        = {
+               .name   = "IB_OPCODE_RD_RDMA_READ_REQUEST",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK
+                               | RXE_REQ_MASK | RXE_READ_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_RETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_RDETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST]         = {
+               .name   = "IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST",
+               .mask   = RXE_RDETH_MASK | RXE_AETH_MASK
+                               | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+                               | RXE_START_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_AETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE]                = {
+               .name   = "IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE",
+               .mask   = RXE_RDETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+                               | RXE_MIDDLE_MASK,
+               .length = RXE_BTH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST]          = {
+               .name   = "IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST",
+               .mask   = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_ACK_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_AETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY]          = {
+               .name   = "IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY",
+               .mask   = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK
+                               | RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_AETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_ACKNOWLEDGE]                      = {
+               .name   = "IB_OPCODE_RD_ACKNOWLEDGE",
+               .mask   = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ACK_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_AETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE]                       = {
+               .name   = "IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE",
+               .mask   = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ATMACK_MASK
+                               | RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_AETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_ATMACK]    = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_AETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_COMPARE_SWAP]                     = {     /* RD atomic request; START and END both set */
+               .name   = "IB_OPCODE_RD_COMPARE_SWAP",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK
+                               | RXE_REQ_MASK | RXE_ATOMIC_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {     /* each header starts where the previous one ends */
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_ATMETH]    = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES /* same sum as .length */
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_ATMETH_BYTES,
+               }
+       },
+       [IB_OPCODE_RD_FETCH_ADD]                        = {     /* RD atomic request; START and END both set */
+               .name   = "IB_OPCODE_RD_FETCH_ADD",
+               .mask   = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK
+                               | RXE_REQ_MASK | RXE_ATOMIC_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES
+                               + RXE_RDETH_BYTES,
+               .offset = {     /* each header starts where the previous one ends */
+                       [RXE_BTH]       = 0,
+                       [RXE_RDETH]     = RXE_BTH_BYTES,
+                       [RXE_DETH]      = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES,
+                       [RXE_ATMETH]    = RXE_BTH_BYTES
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES /* same sum as .length */
+                                               + RXE_RDETH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_ATMETH_BYTES,
+               }
+       },
+
+       /* UD */
+       [IB_OPCODE_UD_SEND_ONLY]                        = {
+               .name   = "IB_OPCODE_UD_SEND_ONLY",
+               .mask   = RXE_DETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+                               | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+                               | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_DETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_DETH]      = RXE_BTH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_DETH_BYTES,
+               }
+       },
+       [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]         = {
+               .name   = "IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE",
+               .mask   = RXE_DETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK
+                               | RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+                               | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK,
+               .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES,
+               .offset = {
+                       [RXE_BTH]       = 0,
+                       [RXE_DETH]      = RXE_BTH_BYTES,
+                       [RXE_IMMDT]     = RXE_BTH_BYTES
+                                               + RXE_DETH_BYTES,
+                       [RXE_PAYLOAD]   = RXE_BTH_BYTES
+                                               + RXE_DETH_BYTES
+                                               + RXE_IMMDT_BYTES,
+               }
+       },
+
+};
diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h

new file mode 100644 (file)

index 0000000..307604e
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_opcode.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_OPCODE_H
+#define RXE_OPCODE_H
+
+/*
+ * Contains the header bit mask definitions and the declarations of
+ * the rxe_opcode_info struct (per-opcode header length and offsets)
+ * and the rxe_wr_opcode_info struct.
+ */
+
+enum rxe_wr_mask {     /* bit flags; element type of rxe_wr_opcode_info.mask[] */
+       WR_INLINE_MASK                  = BIT(0),
+       WR_ATOMIC_MASK                  = BIT(1),
+       WR_SEND_MASK                    = BIT(2),
+       WR_READ_MASK                    = BIT(3),
+       WR_WRITE_MASK                   = BIT(4),
+       WR_LOCAL_MASK                   = BIT(5),
+       WR_REG_MASK                     = BIT(6),
+       /* combinations of the single-bit flags above */
+       WR_READ_OR_WRITE_MASK           = WR_READ_MASK | WR_WRITE_MASK,
+       WR_READ_WRITE_OR_SEND_MASK      = WR_READ_OR_WRITE_MASK | WR_SEND_MASK,
+       WR_WRITE_OR_SEND_MASK           = WR_WRITE_MASK | WR_SEND_MASK,
+       WR_ATOMIC_OR_READ_MASK          = WR_ATOMIC_MASK | WR_READ_MASK,
+};
+
+#define WR_MAX_QPT             (8)     /* length of rxe_wr_opcode_info.mask[] */
+/* A name plus WR_MAX_QPT rxe_wr_mask values. */
+struct rxe_wr_opcode_info {
+       char                    *name;
+       enum rxe_wr_mask        mask[WR_MAX_QPT];       /* NOTE(review): presumably indexed by QP type -- confirm */
+};
+
+extern struct rxe_wr_opcode_info rxe_wr_opcode_info[]; /* declaration only; array size not given here */
+
+enum rxe_hdr_type {    /* index into rxe_opcode_info.offset[]; also the bit number of each header flag in enum rxe_hdr_mask */
+       RXE_LRH,
+       RXE_GRH,
+       RXE_BTH,
+       RXE_RETH,
+       RXE_AETH,
+       RXE_ATMETH,
+       RXE_ATMACK,
+       RXE_IETH,
+       RXE_RDETH,
+       RXE_DETH,
+       RXE_IMMDT,
+       RXE_PAYLOAD,
+       NUM_HDR_TYPES   /* count of the entries above; sizes offset[] and is the base bit of the non-header flags */
+};
+
+enum rxe_hdr_mask {    /* flags OR-ed together in rxe_opcode_info.mask */
+       RXE_LRH_MASK            = BIT(RXE_LRH), /* first group: one bit per enum rxe_hdr_type value */
+       RXE_GRH_MASK            = BIT(RXE_GRH),
+       RXE_BTH_MASK            = BIT(RXE_BTH),
+       RXE_IMMDT_MASK          = BIT(RXE_IMMDT),
+       RXE_RETH_MASK           = BIT(RXE_RETH),
+       RXE_AETH_MASK           = BIT(RXE_AETH),
+       RXE_ATMETH_MASK         = BIT(RXE_ATMETH),
+       RXE_ATMACK_MASK         = BIT(RXE_ATMACK),
+       RXE_IETH_MASK           = BIT(RXE_IETH),
+       RXE_RDETH_MASK          = BIT(RXE_RDETH),
+       RXE_DETH_MASK           = BIT(RXE_DETH),
+       RXE_PAYLOAD_MASK        = BIT(RXE_PAYLOAD),
+       /* remaining flags use bits from NUM_HDR_TYPES upward, so they cannot collide with the header bits */
+       RXE_REQ_MASK            = BIT(NUM_HDR_TYPES + 0),
+       RXE_ACK_MASK            = BIT(NUM_HDR_TYPES + 1),
+       RXE_SEND_MASK           = BIT(NUM_HDR_TYPES + 2),
+       RXE_WRITE_MASK          = BIT(NUM_HDR_TYPES + 3),
+       RXE_READ_MASK           = BIT(NUM_HDR_TYPES + 4),
+       RXE_ATOMIC_MASK         = BIT(NUM_HDR_TYPES + 5),
+
+       RXE_RWR_MASK            = BIT(NUM_HDR_TYPES + 6),
+       RXE_COMP_MASK           = BIT(NUM_HDR_TYPES + 7),
+
+       RXE_START_MASK          = BIT(NUM_HDR_TYPES + 8),
+       RXE_MIDDLE_MASK         = BIT(NUM_HDR_TYPES + 9),
+       RXE_END_MASK            = BIT(NUM_HDR_TYPES + 10),
+       /* bit NUM_HDR_TYPES + 11 is not assigned in this enum */
+       RXE_LOOPBACK_MASK       = BIT(NUM_HDR_TYPES + 12),
+       /* combinations of the single-bit flags above */
+       RXE_READ_OR_ATOMIC      = (RXE_READ_MASK | RXE_ATOMIC_MASK),
+       RXE_WRITE_OR_SEND       = (RXE_WRITE_MASK | RXE_SEND_MASK),
+};
+
+#define OPCODE_NONE            (-1)    /* NOTE(review): presumably a "no opcode" sentinel; no user is visible in this header -- confirm */
+#define RXE_NUM_OPCODE         256     /* number of entries in rxe_opcode[] */
+/* One rxe_opcode[] entry: name, flag mask, length and per-header offsets. */
+struct rxe_opcode_info {
+       char                    *name;
+       enum rxe_hdr_mask       mask;   /* OR of enum rxe_hdr_mask flags */
+       int                     length; /* table entries set this to the sum of their headers' *_BYTES */
+       int                     offset[NUM_HDR_TYPES];  /* indexed by enum rxe_hdr_type */
+};
+
+extern struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE];
+
+#endif /* RXE_OPCODE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h

new file mode 100644 (file)

index 0000000..f459c43
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_param.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_PARAM_H
+#define RXE_PARAM_H
+
+/* Convert a byte count into an IB_MTU_* constant by picking the highest
+ * threshold (256, 512, 1024, 2048, 4096) that mtu reaches.
+ * Returns 0 when mtu is below 256.
+ */
+static inline enum ib_mtu rxe_mtu_int_to_enum(int mtu)
+{
+       if (mtu >= 4096)
+               return IB_MTU_4096;
+       if (mtu >= 2048)
+               return IB_MTU_2048;
+       if (mtu >= 1024)
+               return IB_MTU_1024;
+       if (mtu >= 512)
+               return IB_MTU_512;
+       if (mtu >= 256)
+               return IB_MTU_256;
+
+       return 0;
+}
+
+/* Find the IB mtu for a given network MTU: RXE_MAX_HDR_LENGTH is
+ * subtracted first and the remainder is mapped by rxe_mtu_int_to_enum().
+ */
+static inline enum ib_mtu eth_mtu_int_to_enum(int mtu)
+{
+       int payload = mtu - RXE_MAX_HDR_LENGTH;
+
+       return rxe_mtu_int_to_enum(payload);
+}
+
+/* default/initial rxe device parameter settings */
+enum rxe_device_param {
+       RXE_FW_VER                      = 0,
+       /* all-ones 64-bit value.
+        * NOTE(review): wider than int, so this enumerator relies on the
+        * compiler accepting enum constants outside the int range.
+        */
+       RXE_MAX_MR_SIZE                 = -1ull,
+       RXE_PAGE_SIZE_CAP               = 0xfffff000,
+       RXE_VENDOR_ID                   = 0,
+       RXE_VENDOR_PART_ID              = 0,
+       RXE_HW_VER                      = 0,
+       RXE_MAX_QP                      = 0x10000,
+       RXE_MAX_QP_WR                   = 0x4000,
+       RXE_MAX_INLINE_DATA             = 400,
+       RXE_DEVICE_CAP_FLAGS            = IB_DEVICE_BAD_PKEY_CNTR
+                                       | IB_DEVICE_BAD_QKEY_CNTR
+                                       | IB_DEVICE_AUTO_PATH_MIG
+                                       | IB_DEVICE_CHANGE_PHY_PORT
+                                       | IB_DEVICE_UD_AV_PORT_ENFORCE
+                                       | IB_DEVICE_PORT_ACTIVE_EVENT
+                                       | IB_DEVICE_SYS_IMAGE_GUID
+                                       | IB_DEVICE_RC_RNR_NAK_GEN
+                                       | IB_DEVICE_SRQ_RESIZE
+                                       | IB_DEVICE_MEM_MGT_EXTENSIONS,
+       RXE_MAX_SGE                     = 32,
+       RXE_MAX_SGE_RD                  = 32,
+       RXE_MAX_CQ                      = 16384,
+       RXE_MAX_LOG_CQE                 = 13,
+       RXE_MAX_MR                      = 2 * 1024,
+       RXE_MAX_PD                      = 0x7ffc,
+       RXE_MAX_QP_RD_ATOM              = 128,
+       RXE_MAX_EE_RD_ATOM              = 0,
+       RXE_MAX_RES_RD_ATOM             = 0x3f000,
+       RXE_MAX_QP_INIT_RD_ATOM         = 128,
+       RXE_MAX_EE_INIT_RD_ATOM         = 0,
+       RXE_ATOMIC_CAP                  = 1,
+       RXE_MAX_EE                      = 0,
+       RXE_MAX_RDD                     = 0,
+       RXE_MAX_MW                      = 0,
+       RXE_MAX_RAW_IPV6_QP             = 0,
+       RXE_MAX_RAW_ETHY_QP             = 0,
+       RXE_MAX_MCAST_GRP               = 8192,
+       RXE_MAX_MCAST_QP_ATTACH         = 56,
+       RXE_MAX_TOT_MCAST_QP_ATTACH     = 0x70000,
+       RXE_MAX_AH                      = 100,
+       RXE_MAX_FMR                     = 0,
+       RXE_MAX_MAP_PER_FMR             = 0,
+       RXE_MAX_SRQ                     = 960,
+       RXE_MAX_SRQ_WR                  = 0x4000,
+       RXE_MIN_SRQ_WR                  = 1,
+       RXE_MAX_SRQ_SGE                 = 27,
+       RXE_MIN_SRQ_SGE                 = 1,
+       RXE_MAX_FMR_PAGE_LIST_LEN       = 512,
+       RXE_MAX_PKEYS                   = 64,
+       RXE_LOCAL_CA_ACK_DELAY          = 15,
+
+       RXE_MAX_UCONTEXT                = 512,
+
+       RXE_NUM_PORT                    = 1,
+       RXE_NUM_COMP_VECTORS            = 1,
+
+       /* Index ranges used by the indexed object pools (the min_index /
+        * max_index fields of rxe_type_info[] in rxe_pool.c).
+        * The QP and SRQ ranges do not overlap each other.
+        */
+       RXE_MIN_QP_INDEX                = 16,
+       RXE_MAX_QP_INDEX                = 0x00020000,
+
+       RXE_MIN_SRQ_INDEX               = 0x00020001,
+       RXE_MAX_SRQ_INDEX               = 0x00040000,
+
+       /* the MW range starts right after the MR range ends */
+       RXE_MIN_MR_INDEX                = 0x00000001,
+       RXE_MAX_MR_INDEX                = 0x00040000,
+       RXE_MIN_MW_INDEX                = 0x00040001,
+       RXE_MAX_MW_INDEX                = 0x00060000,
+       RXE_MAX_PKT_PER_ACK             = 64,
+
+       RXE_MAX_UNACKED_PSNS            = 128,
+
+       /* Max inflight SKBs per queue pair */
+       RXE_INFLIGHT_SKBS_PER_QP_HIGH   = 64,
+       RXE_INFLIGHT_SKBS_PER_QP_LOW    = 16,
+
+       /* Delay before calling arbiter timer */
+       RXE_NSEC_ARB_TIMER_DELAY        = 200,
+};
+
+/* default/initial rxe port parameters */
+enum rxe_port_param {
+       RXE_PORT_STATE                  = IB_PORT_DOWN,
+       RXE_PORT_MAX_MTU                = IB_MTU_4096,
+       RXE_PORT_ACTIVE_MTU             = IB_MTU_256,
+       RXE_PORT_GID_TBL_LEN            = 1024,
+       RXE_PORT_PORT_CAP_FLAGS         = RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP,
+       RXE_PORT_MAX_MSG_SZ             = 0x800000,
+       RXE_PORT_BAD_PKEY_CNTR          = 0,
+       RXE_PORT_QKEY_VIOL_CNTR         = 0,
+       RXE_PORT_LID                    = 0,
+       RXE_PORT_SM_LID                 = 0,
+       RXE_PORT_SM_SL                  = 0,
+       RXE_PORT_LMC                    = 0,
+       RXE_PORT_MAX_VL_NUM             = 1,
+       RXE_PORT_SUBNET_TIMEOUT         = 0,
+       RXE_PORT_INIT_TYPE_REPLY        = 0,
+       RXE_PORT_ACTIVE_WIDTH           = IB_WIDTH_1X,
+       RXE_PORT_ACTIVE_SPEED           = 1,
+       /* same value as RXE_MAX_PKEYS in enum rxe_device_param */
+       RXE_PORT_PKEY_TBL_LEN           = 64,
+       RXE_PORT_PHYS_STATE             = 2,
+       /* 64-bit constant with the top 16 bits set to 0xfe80.
+        * NOTE(review): presumably the link-local GID prefix; it is also
+        * wider than int, like RXE_MAX_MR_SIZE.
+        */
+       RXE_PORT_SUBNET_PREFIX          = 0xfe80000000000000ULL,
+};
+
+/* default/initial port info parameters.
+ * The trailing comment on each line is the original author's note on
+ * what the encoded value stands for.
+ */
+enum rxe_port_info_param {
+       RXE_PORT_INFO_VL_CAP            = 4,    /* 1-8 */
+       RXE_PORT_INFO_MTU_CAP           = 5,    /* 4096 */
+       RXE_PORT_INFO_OPER_VL           = 1,    /* 1 */
+};
+
+#endif /* RXE_PARAM_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c

new file mode 100644 (file)

index 0000000..6bac071
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *             - Redistributions of source code must retain the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer.
+ *
+ *             - Redistributions in binary form must reproduce the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer in the documentation and/or other materials
+ *               provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* info about object pools
+ * note that mr and mw share a single index space
+ * so that one can map an lkey to the correct type of object
+ *
+ * One entry per enum rxe_elem_type value:
+ *  - .size is the object size; rxe_cache_init() rounds it up to
+ *    RXE_POOL_ALIGN when creating the slab cache
+ *  - .cleanup, when set, is called by rxe_elem_release() before the
+ *    object is freed
+ *  - entries flagged RXE_POOL_INDEX supply .min_index/.max_index
+ *  - entries flagged RXE_POOL_KEY supply .key_offset/.key_size
+ *  - .cache is not set here; rxe_cache_init() fills it in
+ */
+struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
+       [RXE_TYPE_UC] = {
+               .name           = "rxe-uc",
+               .size           = sizeof(struct rxe_ucontext),
+       },
+       [RXE_TYPE_PD] = {
+               .name           = "rxe-pd",
+               .size           = sizeof(struct rxe_pd),
+       },
+       [RXE_TYPE_AH] = {
+               .name           = "rxe-ah",
+               .size           = sizeof(struct rxe_ah),
+               .flags          = RXE_POOL_ATOMIC,
+       },
+       [RXE_TYPE_SRQ] = {
+               .name           = "rxe-srq",
+               .size           = sizeof(struct rxe_srq),
+               .flags          = RXE_POOL_INDEX,
+               .min_index      = RXE_MIN_SRQ_INDEX,
+               .max_index      = RXE_MAX_SRQ_INDEX,
+       },
+       [RXE_TYPE_QP] = {
+               .name           = "rxe-qp",
+               .size           = sizeof(struct rxe_qp),
+               .cleanup        = rxe_qp_cleanup,
+               .flags          = RXE_POOL_INDEX,
+               .min_index      = RXE_MIN_QP_INDEX,
+               .max_index      = RXE_MAX_QP_INDEX,
+       },
+       [RXE_TYPE_CQ] = {
+               .name           = "rxe-cq",
+               .size           = sizeof(struct rxe_cq),
+               .cleanup        = rxe_cq_cleanup,
+       },
+       [RXE_TYPE_MR] = {
+               .name           = "rxe-mr",
+               .size           = sizeof(struct rxe_mem),
+               .cleanup        = rxe_mem_cleanup,
+               .flags          = RXE_POOL_INDEX,
+               .max_index      = RXE_MAX_MR_INDEX,
+               .min_index      = RXE_MIN_MR_INDEX,
+       },
+       /* same object type as MR (struct rxe_mem), but no cleanup hook */
+       [RXE_TYPE_MW] = {
+               .name           = "rxe-mw",
+               .size           = sizeof(struct rxe_mem),
+               .flags          = RXE_POOL_INDEX,
+               .max_index      = RXE_MAX_MW_INDEX,
+               .min_index      = RXE_MIN_MW_INDEX,
+       },
+       /* the only keyed pool: lookup key is the mgid member */
+       [RXE_TYPE_MC_GRP] = {
+               .name           = "rxe-mc_grp",
+               .size           = sizeof(struct rxe_mc_grp),
+               .cleanup        = rxe_mc_cleanup,
+               .flags          = RXE_POOL_KEY,
+               .key_offset     = offsetof(struct rxe_mc_grp, mgid),
+               .key_size       = sizeof(union ib_gid),
+       },
+       [RXE_TYPE_MC_ELEM] = {
+               .name           = "rxe-mc_elem",
+               .size           = sizeof(struct rxe_mc_elem),
+               .flags          = RXE_POOL_ATOMIC,
+       },
+};
+
+/* name string of the object type this pool was set up for */
+static inline char *pool_name(struct rxe_pool *pool)
+{
+       struct rxe_type_info *info = &rxe_type_info[pool->type];
+
+       return info->name;
+}
+
+/* slab cache shared by every pool of this pool's object type */
+static inline struct kmem_cache *pool_cache(struct rxe_pool *pool)
+{
+       struct rxe_type_info *info = &rxe_type_info[pool->type];
+
+       return info->cache;
+}
+
+/* object type of a pool element, read from the pool that owns it;
+ * arg is treated as a pointer to struct rxe_pool_entry
+ */
+static inline enum rxe_elem_type rxe_type(void *arg)
+{
+       return ((struct rxe_pool_entry *)arg)->pool->type;
+}
+
+/* Create one slab cache for every entry of rxe_type_info[].
+ *
+ * Each cache holds objects of the type's size rounded up to
+ * RXE_POOL_ALIGN. If any cache cannot be created, the caches created so
+ * far are destroyed again and -ENOMEM is returned; otherwise returns 0.
+ */
+int rxe_cache_init(void)
+{
+       int err;
+       int i;
+       size_t size;
+       struct rxe_type_info *type;
+
+       for (i = 0; i < RXE_NUM_TYPES; i++) {
+               type = &rxe_type_info[i];
+               size = ALIGN(type->size, RXE_POOL_ALIGN);
+               type->cache = kmem_cache_create(type->name, size,
+                               RXE_POOL_ALIGN,
+                               RXE_POOL_CACHE_FLAGS, NULL);
+               if (!type->cache) {
+                       pr_err("Unable to init kmem cache for %s\n",
+                              type->name);
+                       err = -ENOMEM;
+                       goto err1;
+               }
+       }
+
+       return 0;
+
+err1:
+       /* Unwind types i-1 .. 0. 'type' must be re-pointed on every pass:
+        * leaving it on the failed entry would destroy a NULL cache over
+        * and over and leak all the caches created before the failure.
+        */
+       while (--i >= 0) {
+               type = &rxe_type_info[i];
+               kmem_cache_destroy(type->cache);
+               type->cache = NULL;
+       }
+
+       return err;
+}
+
+/* Destroy the slab cache of every object type and clear the pointer
+ * kept in rxe_type_info[].
+ */
+void rxe_cache_exit(void)
+{
+       struct rxe_type_info *info = rxe_type_info;
+       struct rxe_type_info *end = rxe_type_info + RXE_NUM_TYPES;
+
+       for (; info < end; info++) {
+               kmem_cache_destroy(info->cache);
+               info->cache = NULL;
+       }
+}
+
+/* Set up the index allocator of an indexed pool.
+ *
+ * Records the [min, max] index range in the pool and allocates a zeroed
+ * bitmap with one bit per index. Returns 0 on success, -EINVAL when the
+ * range has fewer indices than pool->max_elem, or -ENOMEM when the
+ * bitmap cannot be allocated.
+ */
+static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min)
+{
+       u32 nbits = max - min + 1;
+       size_t bytes;
+
+       if (nbits < pool->max_elem) {
+               pr_warn("not enough indices for max_elem\n");
+               return -EINVAL;
+       }
+
+       pool->max_index = max;
+       pool->min_index = min;
+
+       bytes = BITS_TO_LONGS(nbits) * sizeof(long);
+       pool->table = kmalloc(bytes, GFP_KERNEL);
+       if (!pool->table) {
+               pr_warn("no memory for bit table\n");
+               return -ENOMEM;
+       }
+
+       pool->table_size = bytes;
+       bitmap_zero(pool->table, nbits);
+
+       return 0;
+}
+
+/* Initialise an object pool for the given type.
+ *
+ * The pool structure is zeroed and then filled from rxe_type_info[type]:
+ * element size (rounded up to RXE_POOL_ALIGN), flags and cleanup hook.
+ * Indexed types also get their index bitmap, keyed types their key
+ * offset and size. The pool is marked valid only when everything
+ * succeeded. Returns 0, or the error from rxe_pool_init_index().
+ */
+int rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool,
+                 enum rxe_elem_type type, unsigned max_elem)
+{
+       struct rxe_type_info *info = &rxe_type_info[type];
+       int err;
+
+       memset(pool, 0, sizeof(*pool));
+
+       pool->rxe = rxe;
+       pool->type = type;
+       pool->max_elem = max_elem;
+       pool->elem_size = ALIGN(info->size, RXE_POOL_ALIGN);
+       pool->flags = info->flags;
+       pool->tree = RB_ROOT;
+       pool->cleanup = info->cleanup;
+
+       atomic_set(&pool->num_elem, 0);
+       kref_init(&pool->ref_cnt);
+       spin_lock_init(&pool->pool_lock);
+
+       if (info->flags & RXE_POOL_INDEX) {
+               err = rxe_pool_init_index(pool, info->max_index,
+                                         info->min_index);
+               if (err)
+                       return err;
+       }
+
+       if (info->flags & RXE_POOL_KEY) {
+               pool->key_offset = info->key_offset;
+               pool->key_size = info->key_size;
+       }
+
+       pool->state = rxe_pool_valid;
+
+       return 0;
+}
+
+/* kref release callback for a pool: runs when the last reference is
+ * dropped, marks the pool invalid and frees its index bitmap
+ */
+static void rxe_pool_release(struct kref *kref)
+{
+       struct rxe_pool *dead = container_of(kref, struct rxe_pool, ref_cnt);
+
+       dead->state = rxe_pool_invalid;
+       kfree(dead->table);
+}
+
+/* drop one reference on the pool; rxe_pool_release() runs on the last one */
+static void rxe_pool_put(struct rxe_pool *pool)
+{
+       kref_put(&pool->ref_cnt, rxe_pool_release);
+}
+
+/* Shut a pool down.
+ *
+ * Under the pool lock the pool is marked invalid, and a warning is
+ * logged if elements are still allocated. Afterwards one pool reference
+ * is dropped. Always returns 0.
+ */
+int rxe_pool_cleanup(struct rxe_pool *pool)
+{
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&pool->pool_lock, irq_flags);
+
+       pool->state = rxe_pool_invalid;
+       if (atomic_read(&pool->num_elem) > 0) {
+               pr_warn("%s pool destroyed with unfree'd elem\n",
+                       pool_name(pool));
+       }
+
+       spin_unlock_irqrestore(&pool->pool_lock, irq_flags);
+
+       rxe_pool_put(pool);
+
+       return 0;
+}
+
+/* Reserve a free index in the pool's bitmap.
+ *
+ * The search starts at the bit handed out last and wraps to the start
+ * of the bitmap if nothing is free from there on. The chosen bit is set
+ * and remembered in pool->last. Returns the bit number plus
+ * pool->min_index.
+ */
+static u32 alloc_index(struct rxe_pool *pool)
+{
+       u32 nbits = pool->max_index - pool->min_index + 1;
+       u32 bit = find_next_zero_bit(pool->table, nbits, pool->last);
+
+       if (bit >= nbits)
+               bit = find_first_zero_bit(pool->table, nbits);
+
+       set_bit(bit, pool->table);
+       pool->last = bit;
+
+       return pool->min_index + bit;
+}
+
+/* Link an element into the pool's rb tree, ordered by ->index.
+ * If an element with the same index is already present, a warning is
+ * logged and the tree is left unchanged.
+ */
+static void insert_index(struct rxe_pool *pool, struct rxe_pool_entry *new)
+{
+       struct rb_node **slot = &pool->tree.rb_node;
+       struct rb_node *parent = NULL;
+
+       while (*slot) {
+               struct rxe_pool_entry *cur;
+
+               parent = *slot;
+               cur = rb_entry(parent, struct rxe_pool_entry, node);
+
+               if (cur->index == new->index) {
+                       pr_warn("element already exists!\n");
+                       return;
+               }
+
+               /* smaller indices go left, larger ones right */
+               slot = (cur->index > new->index) ? &parent->rb_left
+                                                : &parent->rb_right;
+       }
+
+       rb_link_node(&new->node, parent, slot);
+       rb_insert_color(&new->node, &pool->tree);
+}
+
+/* Link an element into the pool's rb tree, ordered by memcmp() of the
+ * key_size bytes found at key_offset inside each element.
+ * If an element with an equal key is already present, a warning is
+ * logged and the tree is left unchanged.
+ */
+static void insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new)
+{
+       struct rb_node **slot = &pool->tree.rb_node;
+       struct rb_node *parent = NULL;
+
+       while (*slot) {
+               struct rxe_pool_entry *cur;
+               int order;
+
+               parent = *slot;
+               cur = rb_entry(parent, struct rxe_pool_entry, node);
+
+               order = memcmp((u8 *)cur + pool->key_offset,
+                              (u8 *)new + pool->key_offset, pool->key_size);
+               if (order == 0) {
+                       pr_warn("key already exists!\n");
+                       return;
+               }
+
+               slot = (order > 0) ? &parent->rb_left : &parent->rb_right;
+       }
+
+       rb_link_node(&new->node, parent, slot);
+       rb_insert_color(&new->node, &pool->tree);
+}
+
+/* Store a key in a keyed element and insert the element into its pool's
+ * rb tree, all under the pool lock. key_size bytes are copied from key
+ * to key_offset inside the element.
+ */
+void rxe_add_key(void *arg, void *key)
+{
+       struct rxe_pool_entry *entry = arg;
+       struct rxe_pool *owner = entry->pool;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&owner->pool_lock, irq_flags);
+
+       memcpy((u8 *)entry + owner->key_offset, key, owner->key_size);
+       insert_key(owner, entry);
+
+       spin_unlock_irqrestore(&owner->pool_lock, irq_flags);
+}
+
+/* Remove a keyed element from its pool's rb tree under the pool lock */
+void rxe_drop_key(void *arg)
+{
+       struct rxe_pool_entry *entry = arg;
+       struct rxe_pool *owner = entry->pool;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&owner->pool_lock, irq_flags);
+
+       rb_erase(&entry->node, &owner->tree);
+
+       spin_unlock_irqrestore(&owner->pool_lock, irq_flags);
+}
+
+/* Give an element a fresh index from its pool and insert it into the
+ * pool's rb tree, both under the pool lock
+ */
+void rxe_add_index(void *arg)
+{
+       struct rxe_pool_entry *entry = arg;
+       struct rxe_pool *owner = entry->pool;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&owner->pool_lock, irq_flags);
+
+       entry->index = alloc_index(owner);
+       insert_index(owner, entry);
+
+       spin_unlock_irqrestore(&owner->pool_lock, irq_flags);
+}
+
+/* Release an element's index (clear its bit in the pool bitmap) and
+ * remove the element from the pool's rb tree, both under the pool lock
+ */
+void rxe_drop_index(void *arg)
+{
+       struct rxe_pool_entry *entry = arg;
+       struct rxe_pool *owner = entry->pool;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&owner->pool_lock, irq_flags);
+
+       clear_bit(entry->index - owner->min_index, owner->table);
+       rb_erase(&entry->node, &owner->tree);
+
+       spin_unlock_irqrestore(&owner->pool_lock, irq_flags);
+}
+
+/* Allocate a zeroed object from the pool.
+ *
+ * On success the new element holds one reference on the pool and one on
+ * the rxe device, counts towards pool->num_elem, and has its own
+ * reference count initialised. Pools flagged RXE_POOL_ATOMIC allocate
+ * with GFP_ATOMIC, all others with GFP_KERNEL (and may sleep).
+ *
+ * Returns the element, or NULL when the pool is not valid, already
+ * holds max_elem objects, or the slab allocation fails. In every
+ * failure case no references or counts are left behind.
+ */
+void *rxe_alloc(struct rxe_pool *pool)
+{
+       struct rxe_pool_entry *elem;
+       unsigned long flags;
+
+       might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC));
+
+       /* the state check and the pool reference must be taken together */
+       spin_lock_irqsave(&pool->pool_lock, flags);
+       if (pool->state != rxe_pool_valid) {
+               spin_unlock_irqrestore(&pool->pool_lock, flags);
+               return NULL;
+       }
+       kref_get(&pool->ref_cnt);
+       spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+       kref_get(&pool->rxe->ref_cnt);
+
+       if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
+               goto out_cnt;
+
+       elem = kmem_cache_zalloc(pool_cache(pool),
+                                (pool->flags & RXE_POOL_ATOMIC) ?
+                                GFP_ATOMIC : GFP_KERNEL);
+       /* the allocation can fail; never dereference a NULL element */
+       if (!elem)
+               goto out_cnt;
+
+       elem->pool = pool;
+       kref_init(&elem->ref_cnt);
+
+       return elem;
+
+out_cnt:
+       /* undo the element count and both references taken above */
+       atomic_dec(&pool->num_elem);
+       rxe_dev_put(pool->rxe);
+       rxe_pool_put(pool);
+       return NULL;
+}
+
+/* kref release callback for a pool element.
+ *
+ * Runs the pool's cleanup hook (if any), returns the object to the slab
+ * cache, then undoes what rxe_alloc() took: the element count, the
+ * device reference and the pool reference.
+ */
+void rxe_elem_release(struct kref *kref)
+{
+       struct rxe_pool_entry *entry;
+       struct rxe_pool *owner;
+
+       entry = container_of(kref, struct rxe_pool_entry, ref_cnt);
+       owner = entry->pool;
+
+       if (owner->cleanup)
+               owner->cleanup(entry);
+
+       kmem_cache_free(pool_cache(owner), entry);
+
+       atomic_dec(&owner->num_elem);
+       rxe_dev_put(owner->rxe);
+       rxe_pool_put(owner);
+}
+
+/* Look up an element by index in the pool's rb tree.
+ *
+ * The search runs under the pool lock. When a match is found, a
+ * reference is taken on it before the lock is released. Returns the
+ * element, or NULL when the pool is not valid or nothing matches.
+ */
+void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
+{
+       struct rxe_pool_entry *found = NULL;
+       struct rb_node *cur;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&pool->pool_lock, irq_flags);
+
+       if (pool->state != rxe_pool_valid)
+               goto unlock;
+
+       cur = pool->tree.rb_node;
+       while (cur) {
+               struct rxe_pool_entry *entry =
+                       rb_entry(cur, struct rxe_pool_entry, node);
+
+               if (entry->index == index) {
+                       kref_get(&entry->ref_cnt);
+                       found = entry;
+                       break;
+               }
+
+               cur = (entry->index > index) ? cur->rb_left : cur->rb_right;
+       }
+
+unlock:
+       spin_unlock_irqrestore(&pool->pool_lock, irq_flags);
+       return found;
+}
+
+/* Look up an element by key in the pool's rb tree.
+ *
+ * Keys are compared with memcmp() over pool->key_size bytes. The search
+ * runs under the pool lock. When a match is found, a reference is taken
+ * on it before the lock is released. Returns the element, or NULL when
+ * the pool is not valid or nothing matches.
+ */
+void *rxe_pool_get_key(struct rxe_pool *pool, void *key)
+{
+       struct rxe_pool_entry *found = NULL;
+       struct rb_node *cur;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&pool->pool_lock, irq_flags);
+
+       if (pool->state != rxe_pool_valid)
+               goto unlock;
+
+       cur = pool->tree.rb_node;
+       while (cur) {
+               struct rxe_pool_entry *entry =
+                       rb_entry(cur, struct rxe_pool_entry, node);
+               int order = memcmp((u8 *)entry + pool->key_offset,
+                                  key, pool->key_size);
+
+               if (order == 0) {
+                       kref_get(&entry->ref_cnt);
+                       found = entry;
+                       break;
+               }
+
+               cur = (order > 0) ? cur->rb_left : cur->rb_right;
+       }
+
+unlock:
+       spin_unlock_irqrestore(&pool->pool_lock, irq_flags);
+       return found;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h

new file mode 100644 (file)

index 0000000..4d04830
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *             - Redistributions of source code must retain the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer.
+ *
+ *             - Redistributions in binary form must reproduce the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer in the documentation and/or other materials
+ *               provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_POOL_H
+#define RXE_POOL_H
+
+/* object sizes are rounded up to this, and it is the slab alignment */
+#define RXE_POOL_ALIGN         (16)
+/* flags argument passed to kmem_cache_create() for every object cache */
+#define RXE_POOL_CACHE_FLAGS   (0)
+
+/* per-type pool behaviour flags (rxe_type_info.flags, rxe_pool.flags) */
+enum rxe_pool_flags {
+       /* rxe_alloc() uses GFP_ATOMIC instead of GFP_KERNEL */
+       RXE_POOL_ATOMIC         = BIT(0),
+       /* pool has an index bitmap; objects can be looked up by index */
+       RXE_POOL_INDEX          = BIT(1),
+       /* pool has key_offset/key_size; objects can be looked up by key */
+       RXE_POOL_KEY            = BIT(2),
+};
+
+/* object types managed by pools; also the index into rxe_type_info[] */
+enum rxe_elem_type {
+       RXE_TYPE_UC,
+       RXE_TYPE_PD,
+       RXE_TYPE_AH,
+       RXE_TYPE_SRQ,
+       RXE_TYPE_QP,
+       RXE_TYPE_CQ,
+       RXE_TYPE_MR,
+       RXE_TYPE_MW,
+       RXE_TYPE_MC_GRP,
+       RXE_TYPE_MC_ELEM,
+       RXE_NUM_TYPES,          /* keep me last */
+};
+
+/* static description of one object type; see rxe_type_info[] */
+struct rxe_type_info {
+       /* used as the slab cache name and in log messages */
+       char                    *name;
+       /* object size before rounding up to RXE_POOL_ALIGN */
+       size_t                  size;
+       /* optional hook run by rxe_elem_release() before the object is freed */
+       void                    (*cleanup)(void *obj);
+       enum rxe_pool_flags     flags;
+       /* index range, only meaningful with RXE_POOL_INDEX */
+       u32                     max_index;
+       u32                     min_index;
+       /* location and length of the key, only meaningful with RXE_POOL_KEY */
+       size_t                  key_offset;
+       size_t                  key_size;
+       /* slab cache created by rxe_cache_init() */
+       struct kmem_cache       *cache;
+};
+
+/* defined in rxe_pool.c with RXE_NUM_TYPES entries */
+extern struct rxe_type_info rxe_type_info[];
+
+/* a pool only hands out or looks up objects while it is valid */
+enum rxe_pool_state {
+       /* zero value: state after memset in rxe_pool_init() and after cleanup */
+       rxe_pool_invalid,
+       /* set at the end of a successful rxe_pool_init() */
+       rxe_pool_valid,
+};
+
+/* Bookkeeping header of every pooled object. The rxe_add_ref() and
+ * rxe_drop_ref() macros expect objects to embed it as a member named
+ * 'pelem'.
+ */
+struct rxe_pool_entry {
+       /* owning pool, set by rxe_alloc() */
+       struct rxe_pool         *pool;
+       /* object reference count; rxe_elem_release() runs at zero */
+       struct kref             ref_cnt;
+       /* NOTE(review): not touched by rxe_pool.c; purpose not visible here */
+       struct list_head        list;
+
+       /* only used if indexed or keyed */
+       struct rb_node          node;
+       /* value assigned by rxe_add_index() */
+       u32                     index;
+};
+
+/* one pool of objects of a single type, set up by rxe_pool_init() */
+struct rxe_pool {
+       /* device the pool belongs to; each object holds a reference on it */
+       struct rxe_dev          *rxe;
+       spinlock_t              pool_lock; /* pool spinlock */
+       /* object size rounded up to RXE_POOL_ALIGN */
+       size_t                  elem_size;
+       /* one reference from init plus one per allocated object */
+       struct kref             ref_cnt;
+       /* copied from rxe_type_info; run before an object is freed */
+       void                    (*cleanup)(void *obj);
+       enum rxe_pool_state     state;
+       enum rxe_pool_flags     flags;
+       enum rxe_elem_type      type;
+
+       /* limit on, and current number of, allocated objects */
+       unsigned int            max_elem;
+       atomic_t                num_elem;
+
+       /* only used if indexed or keyed */
+       struct rb_root          tree;
+       /* index bitmap: one bit per index in [min_index, max_index] */
+       unsigned long           *table;
+       /* size of 'table' in bytes */
+       size_t                  table_size;
+       u32                     max_index;
+       u32                     min_index;
+       /* bit handed out most recently; the next search starts here */
+       u32                     last;
+       size_t                  key_offset;
+       size_t                  key_size;
+};
+
+/* initialize slab caches for managed objects
+ * returns 0 on success or -ENOMEM if a cache cannot be created
+ */
+int rxe_cache_init(void);
+
+/* cleanup slab caches for managed objects */
+void rxe_cache_exit(void);
+
+/* initialize a pool of objects with given limit on
+ * number of elements. gets parameters from rxe_type_info
+ * pool elements will be allocated out of a slab cache
+ * returns 0 on success, -EINVAL if the type's index range is smaller
+ * than max_elem, or -ENOMEM if the index table cannot be allocated
+ */
+int rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool,
+                 enum rxe_elem_type type, u32 max_elem);
+
+/* mark the pool invalid and drop a reference on it; the index
+ * table is freed when the last reference goes away. always returns 0
+ */
+int rxe_pool_cleanup(struct rxe_pool *pool);
+
+/* allocate an object from pool
+ * returns NULL if the pool is not valid or already holds max_elem objects
+ */
+void *rxe_alloc(struct rxe_pool *pool);
+
+/* assign an index to an indexed object and insert object into
+ *  pool's rb tree
+ */
+void rxe_add_index(void *elem);
+
+/* drop an index and remove object from rb tree */
+void rxe_drop_index(void *elem);
+
+/* assign a key to a keyed object and insert object into
+ *  pool's rb tree
+ */
+void rxe_add_key(void *elem, void *key);
+
+/* remove elem from rb tree */
+void rxe_drop_key(void *elem);
+
+/* lookup an indexed object from index. takes a reference on object
+ * returns NULL if the pool is not valid or no object has that index
+ */
+void *rxe_pool_get_index(struct rxe_pool *pool, u32 index);
+
+/* lookup keyed object from key. takes a reference on the object
+ * returns NULL if the pool is not valid or no object has that key
+ */
+void *rxe_pool_get_key(struct rxe_pool *pool, void *key);
+
+/* cleanup an object when all references are dropped */
+void rxe_elem_release(struct kref *kref);
+
+/* take a reference on an object
+ * elem must point to a struct that has a 'pelem' member
+ */
+#define rxe_add_ref(elem) kref_get(&(elem)->pelem.ref_cnt)
+
+/* drop a reference on an object; rxe_elem_release() runs on the last one
+ * elem must point to a struct that has a 'pelem' member
+ */
+#define rxe_drop_ref(elem) kref_put(&(elem)->pelem.ref_cnt, rxe_elem_release)
+
+#endif /* RXE_POOL_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c

new file mode 100644 (file)

index 0000000..22ba24f
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *             - Redistributions of source code must retain the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer.
+ *
+ *             - Redistributions in binary form must reproduce the above
+ *               copyright notice, this list of conditions and the following
+ *               disclaimer in the documentation and/or other materials
+ *               provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+#include "rxe_task.h"
+
+char *rxe_qp_state_name[] = {  /* printable names, indexed by QP_STATE_* */
+       [QP_STATE_RESET]        = "RESET",
+       [QP_STATE_INIT]         = "INIT",
+       [QP_STATE_READY]        = "READY",
+       [QP_STATE_DRAIN]        = "DRAIN",
+       [QP_STATE_DRAINED]      = "DRAINED",
+       [QP_STATE_ERROR]        = "ERROR",
+};
+
+/* Check the requested QP capabilities against the device limits.
+ * The receive side is only checked when the QP does not use an SRQ.
+ * Returns 0 when every value is within range and -EINVAL (after a
+ * warning) for the first value that is not.
+ */
+static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap,
+                         int has_srq)
+{
+       if (cap->max_send_wr > rxe->attr.max_qp_wr) {
+               pr_warn("invalid send wr = %d > %d\n",
+                       cap->max_send_wr, rxe->attr.max_qp_wr);
+               return -EINVAL;
+       }
+
+       if (cap->max_send_sge > rxe->attr.max_sge) {
+               pr_warn("invalid send sge = %d > %d\n",
+                       cap->max_send_sge, rxe->attr.max_sge);
+               return -EINVAL;
+       }
+
+       if (!has_srq && cap->max_recv_wr > rxe->attr.max_qp_wr) {
+               pr_warn("invalid recv wr = %d > %d\n",
+                       cap->max_recv_wr, rxe->attr.max_qp_wr);
+               return -EINVAL;
+       }
+
+       if (!has_srq && cap->max_recv_sge > rxe->attr.max_sge) {
+               pr_warn("invalid recv sge = %d > %d\n",
+                       cap->max_recv_sge, rxe->attr.max_sge);
+               return -EINVAL;
+       }
+
+       if (cap->max_inline_data > rxe->max_inline_data) {
+               pr_warn("invalid max inline data = %d > %d\n",
+                       cap->max_inline_data, rxe->max_inline_data);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/* Validate the attributes passed to create qp before anything is
+ * allocated: both CQs must be present, the capabilities must fit the
+ * device, and an SMI/GSI QP must target port 1 and must not already
+ * exist on that port. Returns 0 or -EINVAL.
+ */
+int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init)
+{
+       struct rxe_port *port = &rxe->port;
+       int port_num = init->port_num;
+
+       if (!init->recv_cq || !init->send_cq) {
+               pr_warn("missing cq\n");
+               return -EINVAL;
+       }
+
+       if (rxe_qp_chk_cap(rxe, &init->cap, !!init->srq))
+               return -EINVAL;
+
+       /* the remaining checks only concern the special QPs */
+       if (init->qp_type != IB_QPT_SMI && init->qp_type != IB_QPT_GSI)
+               return 0;
+
+       if (port_num != 1) {
+               pr_warn("invalid port = %d\n", port_num);
+               return -EINVAL;
+       }
+
+       if (init->qp_type == IB_QPT_SMI && port->qp_smi_index) {
+               pr_warn("SMI QP exists for port %d\n", port_num);
+               return -EINVAL;
+       }
+
+       if (init->qp_type == IB_QPT_GSI && port->qp_gsi_index) {
+               pr_warn("GSI QP exists for port %d\n", port_num);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/* Allocate a zeroed array of n responder resources for the qp and
+ * reset the head/tail indices. Returns 0 or -ENOMEM.
+ */
+static int alloc_rd_atomic_resources(struct rxe_qp *qp, unsigned int n)
+{
+       qp->resp.res_head = 0;
+       qp->resp.res_tail = 0;
+       qp->resp.resources = kcalloc(n, sizeof(struct resp_res), GFP_KERNEL);
+
+       return qp->resp.resources ? 0 : -ENOMEM;
+}
+
+/* Release every responder resource of the qp and free the array
+ * itself. Does nothing when no array is allocated.
+ */
+static void free_rd_atomic_resources(struct rxe_qp *qp)
+{
+       int idx;
+
+       if (!qp->resp.resources)
+               return;
+
+       for (idx = 0; idx < qp->attr.max_rd_atomic; idx++)
+               free_rd_atomic_resource(qp, &qp->resp.resources[idx]);
+
+       kfree(qp->resp.resources);
+       qp->resp.resources = NULL;
+}
+
+/* Release whatever a single responder resource holds and mark it
+ * unused: an atomic resource drops a qp reference and frees its skb,
+ * a read resource drops the reference on its mr (if any).
+ */
+void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res)
+{
+       if (res->type == RXE_ATOMIC_MASK) {
+               rxe_drop_ref(qp);
+               kfree_skb(res->atomic.skb);
+       } else if (res->type == RXE_READ_MASK && res->read.mr) {
+               rxe_drop_ref(res->read.mr);
+       }
+
+       res->type = 0;
+}
+
+/* Release the contents of every responder resource but keep the
+ * array allocated so it can be reused.
+ */
+static void cleanup_rd_atomic_resources(struct rxe_qp *qp)
+{
+       int idx;
+
+       if (!qp->resp.resources)
+               return;
+
+       for (idx = 0; idx < qp->attr.max_rd_atomic; idx++)
+               free_rd_atomic_resource(qp, &qp->resp.resources[idx]);
+}
+
+/* Initialize the parts of a new qp that belong to neither the
+ * requester nor the responder: signalling type, initial path MTU,
+ * QP number, multicast group list, send packet queue, locks and
+ * counters. SMI and GSI QPs get the fixed numbers 0 and 1 and record
+ * their pool index in the port; every other QP uses its pool index
+ * as the QP number.
+ */
+static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp,
+                            struct ib_qp_init_attr *init)
+{
+       struct rxe_port *port = &rxe->port;
+       u32 index = qp->pelem.index;
+
+       qp->sq_sig_type = init->sq_sig_type;
+       qp->attr.path_mtu = 1;
+       qp->mtu = ib_mtu_enum_to_int(qp->attr.path_mtu);
+
+       if (init->qp_type == IB_QPT_SMI) {
+               qp->ibqp.qp_num = 0;
+               port->qp_smi_index = index;
+               qp->attr.port_num = init->port_num;
+       } else if (init->qp_type == IB_QPT_GSI) {
+               qp->ibqp.qp_num = 1;
+               port->qp_gsi_index = index;
+               qp->attr.port_num = init->port_num;
+       } else {
+               qp->ibqp.qp_num = index;
+       }
+
+       INIT_LIST_HEAD(&qp->grp_list);
+
+       skb_queue_head_init(&qp->send_pkts);
+
+       spin_lock_init(&qp->grp_lock);
+       spin_lock_init(&qp->state_lock);
+
+       atomic_set(&qp->ssn, 0);
+       atomic_set(&qp->skb_out, 0);
+}
+
+/* Set up the requester side of a new qp: the UDP socket, the send
+ * queue (optionally exported to user space through udata), the
+ * requester/completer tasks and the retransmit and RNR NAK timers.
+ *
+ * Returns 0 on success or a negative errno. On failure qp->sq.queue is
+ * left NULL so that a later rxe_qp_cleanup() does not free the queue a
+ * second time. A socket that was already created is not released here;
+ * NOTE(review): it is presumably torn down by rxe_qp_cleanup() when the
+ * caller drops the qp - confirm against the create qp verb.
+ */
+static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
+                          struct ib_qp_init_attr *init,
+                          struct ib_ucontext *context, struct ib_udata *udata)
+{
+       int err;
+       int wqe_size;
+
+       err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk);
+       if (err < 0)
+               return err;
+       qp->sk->sk->sk_user_data = qp;
+
+       qp->sq.max_wr           = init->cap.max_send_wr;
+       qp->sq.max_sge          = init->cap.max_send_sge;
+       qp->sq.max_inline       = init->cap.max_inline_data;
+
+       /* a wqe must hold either the full sge list or the inline data */
+       wqe_size = max_t(int, sizeof(struct rxe_send_wqe) +
+                        qp->sq.max_sge * sizeof(struct ib_sge),
+                        sizeof(struct rxe_send_wqe) +
+                        qp->sq.max_inline);
+
+       /* rxe_queue_init may round max_wr up */
+       qp->sq.queue = rxe_queue_init(rxe,
+                                     &qp->sq.max_wr,
+                                     wqe_size);
+       if (!qp->sq.queue)
+               return -ENOMEM;
+
+       err = do_mmap_info(rxe, udata, true,
+                          context, qp->sq.queue->buf,
+                          qp->sq.queue->buf_size, &qp->sq.queue->ip);
+
+       if (err) {
+               kvfree(qp->sq.queue->buf);
+               kfree(qp->sq.queue);
+               /* do not leave a dangling pointer for rxe_qp_cleanup() */
+               qp->sq.queue = NULL;
+               return err;
+       }
+
+       qp->req.wqe_index       = producer_index(qp->sq.queue);
+       qp->req.state           = QP_STATE_RESET;
+       qp->req.opcode          = -1;
+       qp->comp.opcode         = -1;
+
+       spin_lock_init(&qp->sq.sq_lock);
+       skb_queue_head_init(&qp->req_pkts);
+
+       rxe_init_task(rxe, &qp->req.task, qp,
+                     rxe_requester, "req");
+       rxe_init_task(rxe, &qp->comp.task, qp,
+                     rxe_completer, "comp");
+
+       init_timer(&qp->rnr_nak_timer);
+       qp->rnr_nak_timer.function = rnr_nak_timer;
+       qp->rnr_nak_timer.data = (unsigned long)qp;
+
+       init_timer(&qp->retrans_timer);
+       qp->retrans_timer.function = retransmit_timer;
+       qp->retrans_timer.data = (unsigned long)qp;
+       qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */
+
+       return 0;
+}
+
+/* Set up the responder side of a new qp: the receive queue (only when
+ * the qp does not use an SRQ, optionally exported to user space through
+ * udata), the receive locks, the response packet queue and the
+ * responder task.
+ *
+ * Returns 0 on success or a negative errno. On failure qp->rq.queue is
+ * left NULL so that a later rxe_qp_cleanup() does not free the queue a
+ * second time.
+ */
+static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp,
+                           struct ib_qp_init_attr *init,
+                           struct ib_ucontext *context, struct ib_udata *udata)
+{
+       int err;
+       int wqe_size;
+
+       if (!qp->srq) {
+               qp->rq.max_wr           = init->cap.max_recv_wr;
+               qp->rq.max_sge          = init->cap.max_recv_sge;
+
+               wqe_size = rcv_wqe_size(qp->rq.max_sge);
+
+               pr_debug("max_wr = %d, max_sge = %d, wqe_size = %d\n",
+                        qp->rq.max_wr, qp->rq.max_sge, wqe_size);
+
+               /* rxe_queue_init may round max_wr up */
+               qp->rq.queue = rxe_queue_init(rxe,
+                                             &qp->rq.max_wr,
+                                             wqe_size);
+               if (!qp->rq.queue)
+                       return -ENOMEM;
+
+               err = do_mmap_info(rxe, udata, false, context,
+                                  qp->rq.queue->buf,
+                                  qp->rq.queue->buf_size,
+                                  &qp->rq.queue->ip);
+               if (err) {
+                       kvfree(qp->rq.queue->buf);
+                       kfree(qp->rq.queue);
+                       /* do not leave a dangling pointer for
+                        * rxe_qp_cleanup()
+                        */
+                       qp->rq.queue = NULL;
+                       return err;
+               }
+       }
+
+       spin_lock_init(&qp->rq.producer_lock);
+       spin_lock_init(&qp->rq.consumer_lock);
+
+       skb_queue_head_init(&qp->resp_pkts);
+
+       rxe_init_task(rxe, &qp->resp.task, qp,
+                     rxe_responder, "resp");
+
+       qp->resp.opcode         = OPCODE_NONE;
+       qp->resp.msn            = 0;
+       qp->resp.state          = QP_STATE_RESET;
+
+       return 0;
+}
+
+/* called by the create qp verb
+ *
+ * Takes a reference on the pd, both CQs and the optional SRQ, stores
+ * them in the qp and initializes the common, requester and responder
+ * parts. On success the qp is left in the RESET state and marked valid.
+ *
+ * On failure every reference taken here is dropped again and the
+ * corresponding qp fields (and the freed send queue pointer) are
+ * cleared, because rxe_qp_cleanup() drops/frees whatever is still set
+ * in the qp and would otherwise release these objects a second time.
+ *
+ * Returns 0 or a negative errno.
+ */
+int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
+                    struct ib_qp_init_attr *init, struct ib_udata *udata,
+                    struct ib_pd *ibpd)
+{
+       int err;
+       struct rxe_cq *rcq = to_rcq(init->recv_cq);
+       struct rxe_cq *scq = to_rcq(init->send_cq);
+       struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL;
+       struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL;
+
+       rxe_add_ref(pd);
+       rxe_add_ref(rcq);
+       rxe_add_ref(scq);
+       if (srq)
+               rxe_add_ref(srq);
+
+       qp->pd                  = pd;
+       qp->rcq                 = rcq;
+       qp->scq                 = scq;
+       qp->srq                 = srq;
+
+       rxe_qp_init_misc(rxe, qp, init);
+
+       err = rxe_qp_init_req(rxe, qp, init, context, udata);
+       if (err)
+               goto err1;
+
+       err = rxe_qp_init_resp(rxe, qp, init, context, udata);
+       if (err)
+               goto err2;
+
+       qp->attr.qp_state = IB_QPS_RESET;
+       qp->valid = 1;
+
+       return 0;
+
+err2:
+       rxe_queue_cleanup(qp->sq.queue);
+       qp->sq.queue = NULL;
+err1:
+       /* the qp no longer owns these references */
+       qp->pd = NULL;
+       qp->rcq = NULL;
+       qp->scq = NULL;
+       qp->srq = NULL;
+
+       if (srq)
+               rxe_drop_ref(srq);
+       rxe_drop_ref(scq);
+       rxe_drop_ref(rcq);
+       rxe_drop_ref(pd);
+
+       return err;
+}
+
+/* called by the query qp verb
+ *
+ * Fills an ib_qp_init_attr from the current state of the qp. The
+ * receive capabilities are only reported when the qp has no SRQ and
+ * the port number is always reported as 1. Always returns 0.
+ */
+int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init)
+{
+       struct ib_qp_cap *cap = &init->cap;
+
+       init->event_handler = qp->ibqp.event_handler;
+       init->qp_context = qp->ibqp.qp_context;
+       init->send_cq = qp->ibqp.send_cq;
+       init->recv_cq = qp->ibqp.recv_cq;
+       init->srq = qp->ibqp.srq;
+
+       cap->max_send_wr = qp->sq.max_wr;
+       cap->max_send_sge = qp->sq.max_sge;
+       cap->max_inline_data = qp->sq.max_inline;
+
+       if (!qp->srq) {
+               cap->max_recv_wr = qp->rq.max_wr;
+               cap->max_recv_sge = qp->rq.max_sge;
+       }
+
+       init->sq_sig_type = qp->sq_sig_type;
+       init->qp_type = qp->ibqp.qp_type;
+       init->port_num = 1;
+
+       return 0;
+}
+
+/* called by the modify qp verb. Validates every attribute selected by
+ * mask without changing the qp, so that rxe_qp_from_attr() can apply
+ * them afterwards. Returns 0 if everything is acceptable, -EINVAL
+ * otherwise.
+ */
+int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp,
+                   struct ib_qp_attr *attr, int mask)
+{
+       enum ib_qp_state cur_state;
+       enum ib_qp_state new_state;
+
+       cur_state = (mask & IB_QP_CUR_STATE) ?
+                       attr->cur_qp_state : qp->attr.qp_state;
+       new_state = (mask & IB_QP_STATE) ? attr->qp_state : cur_state;
+
+       if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask,
+                               IB_LINK_LAYER_ETHERNET)) {
+               pr_warn("invalid mask or state for qp\n");
+               return -EINVAL;
+       }
+
+       /* while the send queue is still draining only ERR is allowed */
+       if ((mask & IB_QP_STATE) && cur_state == IB_QPS_SQD &&
+           qp->req.state == QP_STATE_DRAIN && new_state != IB_QPS_ERR)
+               return -EINVAL;
+
+       if ((mask & IB_QP_PORT) && attr->port_num != 1) {
+               pr_warn("invalid port %d\n", attr->port_num);
+               return -EINVAL;
+       }
+
+       if ((mask & IB_QP_CAP) && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq))
+               return -EINVAL;
+
+       if ((mask & IB_QP_AV) && rxe_av_chk_attr(rxe, &attr->ah_attr))
+               return -EINVAL;
+
+       if (mask & IB_QP_ALT_PATH) {
+               if (rxe_av_chk_attr(rxe, &attr->alt_ah_attr))
+                       return -EINVAL;
+
+               if (attr->alt_port_num != 1) {
+                       pr_warn("invalid alt port %d\n", attr->alt_port_num);
+                       return -EINVAL;
+               }
+
+               if (attr->alt_timeout > 31) {
+                       pr_warn("invalid QP alt timeout %d > 31\n",
+                               attr->alt_timeout);
+                       return -EINVAL;
+               }
+       }
+
+       if (mask & IB_QP_PATH_MTU) {
+               enum ib_mtu max_mtu = rxe->port.attr.max_mtu;
+               enum ib_mtu mtu = attr->path_mtu;
+
+               if (mtu > max_mtu) {
+                       pr_debug("invalid mtu (%d) > (%d)\n",
+                                ib_mtu_enum_to_int(mtu),
+                                ib_mtu_enum_to_int(max_mtu));
+                       return -EINVAL;
+               }
+       }
+
+       if ((mask & IB_QP_MAX_QP_RD_ATOMIC) &&
+           attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) {
+               pr_warn("invalid max_rd_atomic %d > %d\n",
+                       attr->max_rd_atomic,
+                       rxe->attr.max_qp_rd_atom);
+               return -EINVAL;
+       }
+
+       if ((mask & IB_QP_TIMEOUT) && attr->timeout > 31) {
+               pr_warn("invalid QP timeout %d > 31\n",
+                       attr->timeout);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/* move the qp to the reset state
+ *
+ * The tasks are disabled first, then both state machines are put into
+ * QP_STATE_RESET and each task is run once by hand so it can react to
+ * the new state. After that the per-connection bookkeeping is cleared
+ * and the tasks are enabled again. The requester and completer tasks
+ * are only touched when the qp has a send queue.
+ */
+static void rxe_qp_reset(struct rxe_qp *qp)
+{
+       /* stop the responder task from running */
+       rxe_disable_task(&qp->resp.task);
+
+       /* stop request/comp; the completer is only disabled for RC */
+       if (qp->sq.queue) {
+               if (qp_type(qp) == IB_QPT_RC)
+                       rxe_disable_task(&qp->comp.task);
+               rxe_disable_task(&qp->req.task);
+       }
+
+       /* move qp to the reset state */
+       qp->req.state = QP_STATE_RESET;
+       qp->resp.state = QP_STATE_RESET;
+
+       /* let the state machines reset themselves: drain work and packet
+        * queues, etc.
+        */
+       __rxe_do_task(&qp->resp.task);
+
+       if (qp->sq.queue) {
+               __rxe_do_task(&qp->comp.task);
+               __rxe_do_task(&qp->req.task);
+       }
+
+       /* cleanup attributes */
+       atomic_set(&qp->ssn, 0);
+       qp->req.opcode = -1;
+       qp->req.need_retry = 0;
+       qp->req.noack_pkts = 0;
+       qp->resp.msn = 0;
+       qp->resp.opcode = -1;
+       qp->resp.drop_msg = 0;
+       qp->resp.goto_error = 0;
+       qp->resp.sent_psn_nak = 0;
+
+       if (qp->resp.mr) {
+               rxe_drop_ref(qp->resp.mr);
+               qp->resp.mr = NULL;
+       }
+
+       cleanup_rd_atomic_resources(qp);
+
+       /* reenable tasks, mirroring the disable sequence above */
+       rxe_enable_task(&qp->resp.task);
+
+       if (qp->sq.queue) {
+               if (qp_type(qp) == IB_QPT_RC)
+                       rxe_enable_task(&qp->comp.task);
+
+               rxe_enable_task(&qp->req.task);
+       }
+}
+
+/* drain the send queue
+ *
+ * Nothing happens when the qp has no send queue or the requester is
+ * already drained. Otherwise the requester is switched to DRAIN and
+ * the completer and requester are kicked: the completer is scheduled
+ * for RC and run inline for every other QP type.
+ */
+static void rxe_qp_drain(struct rxe_qp *qp)
+{
+       if (!qp->sq.queue)
+               return;
+
+       if (qp->req.state == QP_STATE_DRAINED)
+               return;
+
+       qp->req.state = QP_STATE_DRAIN;
+
+       if (qp_type(qp) == IB_QPT_RC)
+               rxe_run_task(&qp->comp.task, 1);
+       else
+               __rxe_do_task(&qp->comp.task);
+
+       rxe_run_task(&qp->req.task, 1);
+}
+
+/* move the qp to the error state
+ *
+ * Both the requester and the responder are switched to
+ * QP_STATE_ERROR and then all three tasks are kicked so they can act
+ * on the new state. The completer is scheduled through rxe_run_task()
+ * for RC and run inline through __rxe_do_task() for every other type.
+ */
+void rxe_qp_error(struct rxe_qp *qp)
+{
+       qp->req.state = QP_STATE_ERROR;
+       qp->resp.state = QP_STATE_ERROR;
+
+       /* kick the tasks so they drain the work and packet queues */
+       rxe_run_task(&qp->resp.task, 1);
+
+       if (qp_type(qp) == IB_QPT_RC)
+               rxe_run_task(&qp->comp.task, 1);
+       else
+               __rxe_do_task(&qp->comp.task);
+       rxe_run_task(&qp->req.task, 1);
+}
+
+/* called by the modify qp verb
+ *
+ * Applies every attribute selected by mask to the qp. The attributes
+ * are expected to have been validated by rxe_qp_chk_attr() first. A
+ * state change is applied last and triggers the matching reset, drain
+ * or error handling.
+ *
+ * Returns 0 on success or a negative errno if the responder resources
+ * cannot be allocated or a source GID cannot be looked up. Attributes
+ * handled before the failing one have already been applied at that
+ * point.
+ */
+int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
+                    struct ib_udata *udata)
+{
+       int err;
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       union ib_gid sgid;
+       struct ib_gid_attr sgid_attr;
+
+       if (mask & IB_QP_MAX_QP_RD_ATOMIC) {
+               /* __roundup_pow_of_two() is undefined for 0, and 0 is a
+                * legitimate value here, so handle it explicitly
+                */
+               int max_rd_atomic = attr->max_rd_atomic ?
+                       __roundup_pow_of_two(attr->max_rd_atomic) : 0;
+
+               free_rd_atomic_resources(qp);
+
+               err = alloc_rd_atomic_resources(qp, max_rd_atomic);
+               if (err)
+                       return err;
+
+               qp->attr.max_rd_atomic = max_rd_atomic;
+               atomic_set(&qp->req.rd_atomic, max_rd_atomic);
+       }
+
+       /* NOTE(review): this stores attr->qp_state, not
+        * attr->cur_qp_state - confirm that this is intended
+        */
+       if (mask & IB_QP_CUR_STATE)
+               qp->attr.cur_qp_state = attr->qp_state;
+
+       if (mask & IB_QP_EN_SQD_ASYNC_NOTIFY)
+               qp->attr.en_sqd_async_notify = attr->en_sqd_async_notify;
+
+       if (mask & IB_QP_ACCESS_FLAGS)
+               qp->attr.qp_access_flags = attr->qp_access_flags;
+
+       if (mask & IB_QP_PKEY_INDEX)
+               qp->attr.pkey_index = attr->pkey_index;
+
+       if (mask & IB_QP_PORT)
+               qp->attr.port_num = attr->port_num;
+
+       if (mask & IB_QP_QKEY)
+               qp->attr.qkey = attr->qkey;
+
+       if (mask & IB_QP_AV) {
+               /* sgid and sgid_attr are only valid if the lookup succeeds */
+               err = ib_get_cached_gid(&rxe->ib_dev, 1,
+                                       attr->ah_attr.grh.sgid_index, &sgid,
+                                       &sgid_attr);
+               if (err)
+                       return err;
+
+               rxe_av_from_attr(rxe, attr->port_num, &qp->pri_av,
+                                &attr->ah_attr);
+               rxe_av_fill_ip_info(rxe, &qp->pri_av, &attr->ah_attr,
+                                   &sgid_attr, &sgid);
+               if (sgid_attr.ndev)
+                       dev_put(sgid_attr.ndev);
+       }
+
+       if (mask & IB_QP_ALT_PATH) {
+               /* sgid and sgid_attr are only valid if the lookup succeeds */
+               err = ib_get_cached_gid(&rxe->ib_dev, 1,
+                                       attr->alt_ah_attr.grh.sgid_index,
+                                       &sgid, &sgid_attr);
+               if (err)
+                       return err;
+
+               rxe_av_from_attr(rxe, attr->alt_port_num, &qp->alt_av,
+                                &attr->alt_ah_attr);
+               rxe_av_fill_ip_info(rxe, &qp->alt_av, &attr->alt_ah_attr,
+                                   &sgid_attr, &sgid);
+               if (sgid_attr.ndev)
+                       dev_put(sgid_attr.ndev);
+
+               qp->attr.alt_port_num = attr->alt_port_num;
+               qp->attr.alt_pkey_index = attr->alt_pkey_index;
+               qp->attr.alt_timeout = attr->alt_timeout;
+       }
+
+       if (mask & IB_QP_PATH_MTU) {
+               qp->attr.path_mtu = attr->path_mtu;
+               qp->mtu = ib_mtu_enum_to_int(attr->path_mtu);
+       }
+
+       if (mask & IB_QP_TIMEOUT) {
+               qp->attr.timeout = attr->timeout;
+               if (attr->timeout == 0) {
+                       qp->qp_timeout_jiffies = 0;
+               } else {
+                       /* According to the spec, timeout = 4.096 * 2 ^ attr->timeout [us] */
+                       int j = nsecs_to_jiffies(4096ULL << attr->timeout);
+
+                       /* never round a non-zero timeout down to zero */
+                       qp->qp_timeout_jiffies = j ? j : 1;
+               }
+       }
+
+       if (mask & IB_QP_RETRY_CNT) {
+               qp->attr.retry_cnt = attr->retry_cnt;
+               qp->comp.retry_cnt = attr->retry_cnt;
+               pr_debug("set retry count = %d\n", attr->retry_cnt);
+       }
+
+       if (mask & IB_QP_RNR_RETRY) {
+               qp->attr.rnr_retry = attr->rnr_retry;
+               qp->comp.rnr_retry = attr->rnr_retry;
+               pr_debug("set rnr retry count = %d\n", attr->rnr_retry);
+       }
+
+       if (mask & IB_QP_RQ_PSN) {
+               qp->attr.rq_psn = (attr->rq_psn & BTH_PSN_MASK);
+               qp->resp.psn = qp->attr.rq_psn;
+               pr_debug("set resp psn = 0x%x\n", qp->resp.psn);
+       }
+
+       if (mask & IB_QP_MIN_RNR_TIMER) {
+               qp->attr.min_rnr_timer = attr->min_rnr_timer;
+               pr_debug("set min rnr timer = 0x%x\n",
+                        attr->min_rnr_timer);
+       }
+
+       if (mask & IB_QP_SQ_PSN) {
+               qp->attr.sq_psn = (attr->sq_psn & BTH_PSN_MASK);
+               qp->req.psn = qp->attr.sq_psn;
+               qp->comp.psn = qp->attr.sq_psn;
+               pr_debug("set req psn = 0x%x\n", qp->req.psn);
+       }
+
+       if (mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+               /* same zero guard as for max_rd_atomic above */
+               qp->attr.max_dest_rd_atomic = attr->max_dest_rd_atomic ?
+                       __roundup_pow_of_two(attr->max_dest_rd_atomic) : 0;
+       }
+
+       if (mask & IB_QP_PATH_MIG_STATE)
+               qp->attr.path_mig_state = attr->path_mig_state;
+
+       if (mask & IB_QP_DEST_QPN)
+               qp->attr.dest_qp_num = attr->dest_qp_num;
+
+       if (mask & IB_QP_STATE) {
+               qp->attr.qp_state = attr->qp_state;
+
+               switch (attr->qp_state) {
+               case IB_QPS_RESET:
+                       pr_debug("qp state -> RESET\n");
+                       rxe_qp_reset(qp);
+                       break;
+
+               case IB_QPS_INIT:
+                       pr_debug("qp state -> INIT\n");
+                       qp->req.state = QP_STATE_INIT;
+                       qp->resp.state = QP_STATE_INIT;
+                       break;
+
+               case IB_QPS_RTR:
+                       /* only the responder becomes ready in RTR */
+                       pr_debug("qp state -> RTR\n");
+                       qp->resp.state = QP_STATE_READY;
+                       break;
+
+               case IB_QPS_RTS:
+                       /* the requester becomes ready in RTS */
+                       pr_debug("qp state -> RTS\n");
+                       qp->req.state = QP_STATE_READY;
+                       break;
+
+               case IB_QPS_SQD:
+                       pr_debug("qp state -> SQD\n");
+                       rxe_qp_drain(qp);
+                       break;
+
+               case IB_QPS_SQE:
+                       pr_warn("qp state -> SQE !!?\n");
+                       /* Not possible from modify_qp. */
+                       break;
+
+               case IB_QPS_ERR:
+                       pr_debug("qp state -> ERR\n");
+                       rxe_qp_error(qp);
+                       break;
+               }
+       }
+
+       return 0;
+}
+
+/* called by the query qp verb
+ *
+ * Starts from a copy of the stored attributes and then overrides the
+ * fields that are tracked elsewhere in the qp: current PSNs, queue
+ * capabilities, address vectors and the draining flag. Always
+ * returns 0.
+ */
+int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       struct ib_qp_cap *cap = &attr->cap;
+
+       *attr = qp->attr;
+
+       attr->rq_psn = qp->resp.psn;
+       attr->sq_psn = qp->req.psn;
+
+       cap->max_send_wr = qp->sq.max_wr;
+       cap->max_send_sge = qp->sq.max_sge;
+       cap->max_inline_data = qp->sq.max_inline;
+
+       if (!qp->srq) {
+               cap->max_recv_wr = qp->rq.max_wr;
+               cap->max_recv_sge = qp->rq.max_sge;
+       }
+
+       rxe_av_to_attr(rxe, &qp->pri_av, &attr->ah_attr);
+       rxe_av_to_attr(rxe, &qp->alt_av, &attr->alt_ah_attr);
+
+       attr->sq_draining = (qp->req.state == QP_STATE_DRAIN) ? 1 : 0;
+
+       /* applications that see the draining state typically spin on
+        * it, so give up the processor
+        */
+       if (attr->sq_draining)
+               cond_resched();
+
+       pr_debug("attr->sq_draining = %d\n", attr->sq_draining);
+
+       return 0;
+}
+
+/* called by the destroy qp verb
+ *
+ * Marks the qp invalid, clears the retransmit timeout, shuts down the
+ * responder task, stops both timers and shuts down the requester task
+ * (and the completer task for RC). The statement order below is
+ * deliberate teardown sequencing and must be preserved.
+ */
+void rxe_qp_destroy(struct rxe_qp *qp)
+{
+       qp->valid = 0;
+       qp->qp_timeout_jiffies = 0;
+       rxe_cleanup_task(&qp->resp.task);
+
+       del_timer_sync(&qp->retrans_timer);
+       del_timer_sync(&qp->rnr_nak_timer);
+
+       rxe_cleanup_task(&qp->req.task);
+       if (qp_type(qp) == IB_QPT_RC)
+               rxe_cleanup_task(&qp->comp.task);
+
+       /* flush out any receive wr's or pending requests by running the
+        * tasks by hand one last time.
+        * NOTE(review): the requester task is run both before and inside
+        * the sq.queue check - confirm whether the first call was meant
+        * to run the responder task instead.
+        */
+       __rxe_do_task(&qp->req.task);
+       if (qp->sq.queue) {
+               __rxe_do_task(&qp->comp.task);
+               __rxe_do_task(&qp->req.task);
+       }
+}
+
+/* called when the last reference to the qp is dropped
+ *
+ * Detaches the qp from its multicast groups, frees the send and
+ * receive queues, drops the references the qp holds on its SRQ, CQs,
+ * PD and current responder MR, frees the responder resources and
+ * finally shuts down and releases the qp's socket.
+ */
+void rxe_qp_cleanup(void *arg)
+{
+       struct rxe_qp *qp = arg;
+
+       rxe_drop_all_mcast_groups(qp);
+
+       if (qp->sq.queue)
+               rxe_queue_cleanup(qp->sq.queue);
+
+       if (qp->srq)
+               rxe_drop_ref(qp->srq);
+
+       if (qp->rq.queue)
+               rxe_queue_cleanup(qp->rq.queue);
+
+       if (qp->scq)
+               rxe_drop_ref(qp->scq);
+       if (qp->rcq)
+               rxe_drop_ref(qp->rcq);
+       if (qp->pd)
+               rxe_drop_ref(qp->pd);
+
+       if (qp->resp.mr) {
+               rxe_drop_ref(qp->resp.mr);
+               qp->resp.mr = NULL;
+       }
+
+       free_rd_atomic_resources(qp);
+
+       /* The socket comes from sock_create_kern() and has to be handed
+        * back with sock_release(); shutting it down alone leaks it.
+        * The NULL check covers a qp whose socket was never created,
+        * assuming the qp memory starts out zeroed - TODO confirm.
+        */
+       if (qp->sk) {
+               kernel_sock_shutdown(qp->sk, SHUT_RDWR);
+               sock_release(qp->sk);
+               qp->sk = NULL;
+       }
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c

new file mode 100644 (file)

index 0000000..0827425
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_queue.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/vmalloc.h>
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+/* Create the mmap bookkeeping for a queue buffer and report it to user
+ * space.
+ *
+ * When udata is NULL (no user space consumer) nothing is created and
+ * *ip_p is set to NULL. Otherwise an rxe_mmap_info is created for buf,
+ * its info is copied into the user's response buffer and the object is
+ * put on the device's pending_mmaps list. For is_req the info is
+ * written after a leading struct mminfo in the response buffer,
+ * otherwise at offset 0.
+ *
+ * Returns 0 on success, -EINVAL if the response buffer is too small,
+ * -ENOMEM if the mmap info cannot be created and -EFAULT if the copy
+ * to user space fails. *ip_p is only written on success.
+ */
+int do_mmap_info(struct rxe_dev *rxe,
+                struct ib_udata *udata,
+                bool is_req,
+                struct ib_ucontext *context,
+                struct rxe_queue_buf *buf,
+                size_t buf_size,
+                struct rxe_mmap_info **ip_p)
+{
+       int err = -EINVAL;
+       u32 len, offset;
+       struct rxe_mmap_info *ip = NULL;
+
+       if (udata) {
+               if (is_req) {
+                       /* check before subtracting: len is unsigned and
+                        * would wrap around for a short buffer
+                        */
+                       if (udata->outlen < sizeof(struct mminfo))
+                               goto err1;
+
+                       len = udata->outlen - sizeof(struct mminfo);
+                       offset = sizeof(struct mminfo);
+               } else {
+                       len = udata->outlen;
+                       offset = 0;
+               }
+
+               if (len < sizeof(ip->info))
+                       goto err1;
+
+               ip = rxe_create_mmap_info(rxe, buf_size, context, buf);
+               if (!ip) {
+                       /* presumably an allocation failure - TODO confirm
+                        * against rxe_create_mmap_info()
+                        */
+                       err = -ENOMEM;
+                       goto err1;
+               }
+
+               /* copy_to_user() returns the number of bytes left over */
+               if (copy_to_user(udata->outbuf + offset, &ip->info,
+                                sizeof(ip->info))) {
+                       err = -EFAULT;
+                       goto err2;
+               }
+
+               spin_lock_bh(&rxe->pending_lock);
+               list_add(&ip->pending_mmaps, &rxe->pending_mmaps);
+               spin_unlock_bh(&rxe->pending_lock);
+       }
+
+       *ip_p = ip;
+
+       return 0;
+
+err2:
+       kfree(ip);
+err1:
+       return err;
+}
+
+/* Allocate a queue able to hold at least *num_elem elements of
+ * elem_size bytes.
+ *
+ * The element size is padded to at least a cache line and rounded up
+ * to a power of two; the number of slots is *num_elem + 1 rounded up
+ * to a power of two. On success *num_elem is updated to the number of
+ * usable elements (slots - 1). The buffer comes from vmalloc_user()
+ * and the header is zero-initialized, so q->ip is NULL until a caller
+ * sets it.
+ *
+ * Returns the new queue, or NULL if *num_elem is negative or an
+ * allocation fails.
+ */
+struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe,
+                                int *num_elem,
+                                unsigned int elem_size)
+{
+       struct rxe_queue *q;
+       size_t buf_size;
+       unsigned int num_slots;
+
+       /* num_elem == 0 is allowed, but uninteresting */
+       if (*num_elem < 0)
+               goto err1;
+
+       /* zeroed so that no field (q->ip in particular) holds garbage */
+       q = kzalloc(sizeof(*q), GFP_KERNEL);
+       if (!q)
+               goto err1;
+
+       q->rxe = rxe;
+
+       /* used in resize, only need to copy used part of queue */
+       q->elem_size = elem_size;
+
+       /* pad element up to at least a cacheline and always a power of 2 */
+       if (elem_size < cache_line_size())
+               elem_size = cache_line_size();
+       elem_size = roundup_pow_of_two(elem_size);
+
+       q->log2_elem_size = order_base_2(elem_size);
+
+       num_slots = *num_elem + 1;
+       num_slots = roundup_pow_of_two(num_slots);
+       q->index_mask = num_slots - 1;
+
+       /* widen before multiplying so the product is not truncated to
+        * 32 bits where size_t is wider
+        */
+       buf_size = sizeof(struct rxe_queue_buf) +
+                  (size_t)num_slots * elem_size;
+
+       q->buf = vmalloc_user(buf_size);
+       if (!q->buf)
+               goto err2;
+
+       q->buf->log2_elem_size = q->log2_elem_size;
+       q->buf->index_mask = q->index_mask;
+
+       q->buf_size = buf_size;
+
+       *num_elem = num_slots - 1;
+       return q;
+
+err2:
+       kfree(q);
+err1:
+       return NULL;
+}
+
+/* copies elements from original q to new q and then swaps the contents of the
+ * two q headers. This is so that if anyone is holding a pointer to q it will
+ * still work
+ *
+ * Returns -EINVAL, without copying or swapping anything, when q is not
+ * empty and holds more than num_elem elements. Otherwise every pending
+ * element is moved from q to new_q, the two headers are swapped and 0
+ * is returned; after the swap q describes the new buffer and new_q the
+ * old one.
+ */
+static int resize_finish(struct rxe_queue *q, struct rxe_queue *new_q,
+                        unsigned int num_elem)
+{
+       if (!queue_empty(q) && (num_elem < queue_count(q)))
+               return -EINVAL;
+
+       while (!queue_empty(q)) {
+               memcpy(producer_addr(new_q), consumer_addr(q),
+                      new_q->elem_size);
+               advance_producer(new_q);
+               advance_consumer(q);
+       }
+
+       swap(*q, *new_q);
+
+       return 0;
+}
+
+/* Resize q to hold *num_elem_p elements of elem_size bytes.
+ *
+ * A replacement queue is allocated and its mmap info is created first.
+ * The elements are then moved across under consumer_lock and, when one
+ * is supplied, producer_lock. Whichever header is left in the
+ * temporary queue afterwards (the old one on success, the unused new
+ * one on failure) is cleaned up. On success *num_elem_p is updated to
+ * the actual capacity.
+ *
+ * Returns 0, -ENOMEM if the new queue cannot be allocated, or the
+ * error from do_mmap_info() or resize_finish().
+ */
+int rxe_queue_resize(struct rxe_queue *q,
+                    unsigned int *num_elem_p,
+                    unsigned int elem_size,
+                    struct ib_ucontext *context,
+                    struct ib_udata *udata,
+                    spinlock_t *producer_lock,
+                    spinlock_t *consumer_lock)
+{
+       struct rxe_queue *new_q;
+       unsigned int num_elem = *num_elem_p;
+       int err;
+       unsigned long flags = 0, flags1;
+
+       new_q = rxe_queue_init(q->rxe, &num_elem, elem_size);
+       if (!new_q)
+               return -ENOMEM;
+
+       err = do_mmap_info(new_q->rxe, udata, false, context, new_q->buf,
+                          new_q->buf_size, &new_q->ip);
+       if (err) {
+               vfree(new_q->buf);
+               kfree(new_q);
+               return err;
+       }
+
+       /* consumer lock outside, optional producer lock inside */
+       spin_lock_irqsave(consumer_lock, flags1);
+       if (producer_lock)
+               spin_lock_irqsave(producer_lock, flags);
+
+       err = resize_finish(q, new_q, num_elem);
+
+       if (producer_lock)
+               spin_unlock_irqrestore(producer_lock, flags);
+       spin_unlock_irqrestore(consumer_lock, flags1);
+
+       rxe_queue_cleanup(new_q);       /* new/old dep on err */
+       if (err)
+               return err;
+
+       *num_elem_p = num_elem;
+       return 0;
+}
+
+/* Free a queue. When the queue has mmap info, the buffer is released
+ * by dropping the mmap info reference (rxe_mmap_release runs on the
+ * last put); otherwise the buffer is freed directly. The queue header
+ * is always freed.
+ */
+void rxe_queue_cleanup(struct rxe_queue *q)
+{
+       if (!q->ip)
+               vfree(q->buf);
+       else
+               kref_put(&q->ip->ref, rxe_mmap_release);
+
+       kfree(q);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h

new file mode 100644 (file)

index 0000000..239fd60
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_queue.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_QUEUE_H
+#define RXE_QUEUE_H
+
+/* Implements a simple circular buffer that can optionally be
+ * shared between user space and the kernel and can be resized.
+ *
+ * The requested element size is rounded up to a power of 2
+ * and the number of elements in the buffer is also rounded
+ * up to a power of 2. Since the queue is empty when the
+ * producer and consumer indices match, the maximum capacity
+ * of the queue is one less than the number of element slots.
+ */
+
+/* This data structure is shared between user space and kernel
+ * space for those cases where the queue is shared. It contains
+ * the producer and consumer indices. It also contains a copy
+ * of the queue size parameters for user space to use, but the
+ * kernel must use the parameters in the rxe_queue struct.
+ *
+ * This MUST MATCH the corresponding librxe struct.
+ *
+ * For performance reasons the producer and consumer indices are
+ * kept in separate cache lines: the padding arrays place each
+ * index at the start of its own 32-word (128-byte) region, and
+ * the element data follows the third region.
+ *
+ * The kernel should always mask the indices to avoid accessing
+ * memory outside of the data area.
+ */
+struct rxe_queue_buf {
+       __u32                   log2_elem_size;
+       __u32                   index_mask;
+       __u32                   pad_1[30];
+       __u32                   producer_index;
+       __u32                   pad_2[31];
+       __u32                   consumer_index;
+       __u32                   pad_3[31];
+       __u8                    data[0];
+};
+
+/* Kernel-private description of a queue.  The size parameters kept here
+ * (log2_elem_size, index_mask) are what the helpers below use to mask
+ * indices and compute element addresses.
+ */
+struct rxe_queue {
+       /* device the queue belongs to */
+       struct rxe_dev          *rxe;
+       /* index header followed by the element data */
+       struct rxe_queue_buf    *buf;
+       /* mmap info filled in by do_mmap_info(); when non-NULL,
+        * rxe_queue_cleanup() drops a reference on it instead of
+        * freeing buf directly
+        */
+       struct rxe_mmap_info    *ip;
+       /* size passed to do_mmap_info() together with buf */
+       size_t                  buf_size;
+       size_t                  elem_size;
+       /* shift that converts an element index into a byte offset */
+       unsigned int            log2_elem_size;
+       /* mask applied to producer/consumer indices */
+       unsigned int            index_mask;
+};
+
+/* Returns 0 on success and a non-zero error code otherwise; on success
+ * *ip_p is expected to describe the mapping of @buf (see the callers).
+ */
+int do_mmap_info(struct rxe_dev *rxe,
+                struct ib_udata *udata,
+                bool is_req,
+                struct ib_ucontext *context,
+                struct rxe_queue_buf *buf,
+                size_t buf_size,
+                struct rxe_mmap_info **ip_p);
+
+/* Allocates a queue; returns NULL on failure.  *num_elem is in/out. */
+struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe,
+                                int *num_elem,
+                                unsigned int elem_size);
+
+/* Resizes @q; returns 0 and updates *num_elem_p on success, a negative
+ * errno otherwise.  @producer_lock may be NULL.
+ */
+int rxe_queue_resize(struct rxe_queue *q,
+                    unsigned int *num_elem_p,
+                    unsigned int elem_size,
+                    struct ib_ucontext *context,
+                    struct ib_udata *udata,
+                    /* Protect producers while resizing queue */
+                    spinlock_t *producer_lock,
+                    /* Protect consumers while resizing queue */
+                    spinlock_t *consumer_lock);
+
+/* Releases the queue buffer (or its mmap info reference) and the queue. */
+void rxe_queue_cleanup(struct rxe_queue *queue);
+
+/* Return the slot index that follows @index, wrapping at the queue size.
+ *
+ * The mask is taken from the kernel-private rxe_queue, not from the
+ * rxe_queue_buf copy: the latter may be shared with (and modified by)
+ * user space, and the kernel must use the parameters in rxe_queue.
+ */
+static inline int next_index(struct rxe_queue *q, int index)
+{
+       return (index + 1) & q->index_mask;
+}
+
+/* Non-zero when the producer and consumer indices coincide (mod size). */
+static inline int queue_empty(struct rxe_queue *q)
+{
+       u32 used = q->buf->producer_index - q->buf->consumer_index;
+
+       return (used & q->index_mask) == 0;
+}
+
+/* Non-zero when advancing the producer once more would make it meet the
+ * consumer, i.e. only the one reserved slot is left.
+ */
+static inline int queue_full(struct rxe_queue *q)
+{
+       u32 after_post = q->buf->producer_index + 1 - q->buf->consumer_index;
+
+       return (after_post & q->index_mask) == 0;
+}
+
+/* Step the producer index to the next slot, wrapping with index_mask. */
+static inline void advance_producer(struct rxe_queue *q)
+{
+       struct rxe_queue_buf *buf = q->buf;
+
+       buf->producer_index = (buf->producer_index + 1) & q->index_mask;
+}
+
+/* Step the consumer index to the next slot, wrapping with index_mask. */
+static inline void advance_consumer(struct rxe_queue *q)
+{
+       struct rxe_queue_buf *buf = q->buf;
+
+       buf->consumer_index = (buf->consumer_index + 1) & q->index_mask;
+}
+
+/* Address of the element slot the producer index currently points at. */
+static inline void *producer_addr(struct rxe_queue *q)
+{
+       struct rxe_queue_buf *buf = q->buf;
+
+       return buf->data +
+               ((buf->producer_index & q->index_mask) << q->log2_elem_size);
+}
+
+/* Address of the element slot the consumer index currently points at. */
+static inline void *consumer_addr(struct rxe_queue *q)
+{
+       struct rxe_queue_buf *buf = q->buf;
+
+       return buf->data +
+               ((buf->consumer_index & q->index_mask) << q->log2_elem_size);
+}
+
+/* Raw (unmasked) producer index as stored in the queue buffer. */
+static inline unsigned int producer_index(struct rxe_queue *q)
+{
+       struct rxe_queue_buf *buf = q->buf;
+
+       return buf->producer_index;
+}
+
+/* Raw (unmasked) consumer index as stored in the queue buffer. */
+static inline unsigned int consumer_index(struct rxe_queue *q)
+{
+       struct rxe_queue_buf *buf = q->buf;
+
+       return buf->consumer_index;
+}
+
+/* Address of the element slot for @index (masked to the queue size).
+ *
+ * The element shift comes from the kernel-private rxe_queue rather than
+ * from the rxe_queue_buf copy, which may be shared with user space; this
+ * matches producer_addr()/consumer_addr() and keeps the computed address
+ * inside the data area regardless of what user space writes.
+ */
+static inline void *addr_from_index(struct rxe_queue *q, unsigned int index)
+{
+       return q->buf->data + ((index & q->index_mask)
+                               << q->log2_elem_size);
+}
+
+/* Inverse of addr_from_index(): slot index of the element at @addr. */
+static inline unsigned int index_from_addr(const struct rxe_queue *q,
+                                          const void *addr)
+{
+       ptrdiff_t byte_off = (u8 *)addr - q->buf->data;
+
+       return (byte_off >> q->log2_elem_size) & q->index_mask;
+}
+
+/* Number of elements currently queued (producer minus consumer, masked). */
+static inline unsigned int queue_count(const struct rxe_queue *q)
+{
+       const struct rxe_queue_buf *buf = q->buf;
+
+       return (buf->producer_index - buf->consumer_index) & q->index_mask;
+}
+
+/* Oldest queued element, or NULL when the queue is empty. */
+static inline void *queue_head(struct rxe_queue *q)
+{
+       if (queue_empty(q))
+               return NULL;
+
+       return consumer_addr(q);
+}
+
+#endif /* RXE_QUEUE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c

new file mode 100644 (file)

index 0000000..3d464c2
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_recv.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* Check that @pkt may be handed to @qp.
+ *
+ * The QP must be valid, the transport class encoded in the packet opcode
+ * must match the QP type, and the relevant half of the QP must be in a
+ * state that accepts packets: the responder must be READY for request
+ * packets, the requester must be between READY and DRAINED otherwise.
+ *
+ * Returns 0 if the packet is acceptable, -EINVAL otherwise.
+ */
+static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+                           struct rxe_qp *qp)
+{
+       unsigned int pkt_type;
+
+       if (unlikely(!qp->valid))
+               goto err1;
+
+       /* The three high-order opcode bits select the transport class.
+        * Compare them for equality: IB_OPCODE_RC is 0, so a bit test
+        * against it can never fail, and bit tests against the other
+        * classes also accept classes that merely share a bit.
+        */
+       pkt_type = pkt->opcode & 0xe0;
+
+       switch (qp_type(qp)) {
+       case IB_QPT_RC:
+               if (unlikely(pkt_type != IB_OPCODE_RC)) {
+                       pr_warn_ratelimited("bad qp type\n");
+                       goto err1;
+               }
+               break;
+       case IB_QPT_UC:
+               if (unlikely(pkt_type != IB_OPCODE_UC)) {
+                       pr_warn_ratelimited("bad qp type\n");
+                       goto err1;
+               }
+               break;
+       case IB_QPT_UD:
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               if (unlikely(pkt_type != IB_OPCODE_UD)) {
+                       pr_warn_ratelimited("bad qp type\n");
+                       goto err1;
+               }
+               break;
+       default:
+               pr_warn_ratelimited("unsupported qp type\n");
+               goto err1;
+       }
+
+       if (pkt->mask & RXE_REQ_MASK) {
+               if (unlikely(qp->resp.state != QP_STATE_READY))
+                       goto err1;
+       } else if (unlikely(qp->req.state < QP_STATE_READY ||
+                               qp->req.state > QP_STATE_DRAINED)) {
+               goto err1;
+       }
+
+       return 0;
+
+err1:
+       return -EINVAL;
+}
+
+/* Count one bad-P_Key event on @port.  The counter is updated under
+ * port_lock and clamped so that it never exceeds 0xffff.
+ */
+static void set_bad_pkey_cntr(struct rxe_port *port)
+{
+       spin_lock_bh(&port->port_lock);
+       port->attr.bad_pkey_cntr = min((u32)0xffff,
+                                      port->attr.bad_pkey_cntr + 1);
+       spin_unlock_bh(&port->port_lock);
+}
+
+/* Count one Q_Key violation on @port.  The counter is updated under
+ * port_lock and clamped so that it never exceeds 0xffff.
+ */
+static void set_qkey_viol_cntr(struct rxe_port *port)
+{
+       spin_lock_bh(&port->port_lock);
+       port->attr.qkey_viol_cntr = min((u32)0xffff,
+                                       port->attr.qkey_viol_cntr + 1);
+       spin_unlock_bh(&port->port_lock);
+}
+
+/* Validate the P_Key and, for UD/GSI QPs, the Q_Key of @pkt.
+ *
+ * For qpn 1 the P_Key may match any entry of the port table; for any
+ * other non-zero qpn it must match the entry selected by the QP.  The
+ * index of the matching entry is recorded in pkt->pkey_index (0 when no
+ * P_Key check applies).  Violations bump the per-port counters.
+ *
+ * Returns 0 when the keys are acceptable, -EINVAL otherwise.
+ */
+static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+                     u32 qpn, struct rxe_qp *qp)
+{
+       struct rxe_port *port = &rxe->port;
+       u16 pkey = bth_pkey(pkt);
+       int matched = 0;
+       int idx;
+
+       pkt->pkey_index = 0;
+
+       if (qpn == 1) {
+               for (idx = 0; idx < port->attr.pkey_tbl_len; idx++) {
+                       if (!pkey_match(pkey, port->pkey_tbl[idx]))
+                               continue;
+
+                       pkt->pkey_index = idx;
+                       matched = 1;
+                       break;
+               }
+
+               if (!matched) {
+                       pr_warn_ratelimited("bad pkey = 0x%x\n", pkey);
+                       set_bad_pkey_cntr(port);
+                       return -EINVAL;
+               }
+       } else if (qpn != 0) {
+               if (unlikely(!pkey_match(pkey,
+                                        port->pkey_tbl[qp->attr.pkey_index]
+                                       ))) {
+                       pr_warn_ratelimited("bad pkey = 0x%0x\n", pkey);
+                       set_bad_pkey_cntr(port);
+                       return -EINVAL;
+               }
+               pkt->pkey_index = qp->attr.pkey_index;
+       }
+
+       if ((qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) &&
+           qpn != 0 && pkt->mask) {
+               u32 expected = qp->attr.qkey;
+
+               /* qpn 1 always uses the well-known GSI Q_Key */
+               if (qpn == 1)
+                       expected = GSI_QKEY;
+
+               if (unlikely(deth_qkey(pkt) != expected)) {
+                       pr_warn_ratelimited("bad qkey, got 0x%x expected 0x%x for qpn 0x%x\n",
+                                           deth_qkey(pkt), expected, qpn);
+                       set_qkey_viol_cntr(port);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+/* For connected QPs (RC/UC) verify that @pkt arrived on the QP's port and
+ * that its IP addresses mirror the QP's primary address vector: the
+ * packet destination must be the QP source address and vice versa.
+ * Other QP types, and packets that are neither IPv4 nor IPv6, are not
+ * address-checked.
+ *
+ * Returns 0 when the packet is acceptable, -EINVAL otherwise.
+ */
+static int check_addr(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+                     struct rxe_qp *qp)
+{
+       struct sk_buff *skb = PKT_TO_SKB(pkt);
+
+       if (qp_type(qp) != IB_QPT_RC && qp_type(qp) != IB_QPT_UC)
+               return 0;
+
+       if (unlikely(pkt->port_num != qp->attr.port_num)) {
+               pr_warn_ratelimited("port %d != qp port %d\n",
+                                   pkt->port_num, qp->attr.port_num);
+               return -EINVAL;
+       }
+
+       if (skb->protocol == htons(ETH_P_IP)) {
+               struct in_addr *qp_src =
+                       &qp->pri_av.sgid_addr._sockaddr_in.sin_addr;
+               struct in_addr *qp_dst =
+                       &qp->pri_av.dgid_addr._sockaddr_in.sin_addr;
+
+               if (ip_hdr(skb)->daddr != qp_src->s_addr) {
+                       pr_warn_ratelimited("dst addr %pI4 != qp source addr %pI4\n",
+                                           &ip_hdr(skb)->daddr,
+                                           &qp_src->s_addr);
+                       return -EINVAL;
+               }
+
+               if (ip_hdr(skb)->saddr != qp_dst->s_addr) {
+                       pr_warn_ratelimited("source addr %pI4 != qp dst addr %pI4\n",
+                                           &ip_hdr(skb)->saddr,
+                                           &qp_dst->s_addr);
+                       return -EINVAL;
+               }
+
+               return 0;
+       }
+
+       if (skb->protocol == htons(ETH_P_IPV6)) {
+               struct in6_addr *qp_src =
+                       &qp->pri_av.sgid_addr._sockaddr_in6.sin6_addr;
+               struct in6_addr *qp_dst =
+                       &qp->pri_av.dgid_addr._sockaddr_in6.sin6_addr;
+
+               if (memcmp(&ipv6_hdr(skb)->daddr, qp_src, sizeof(*qp_src))) {
+                       pr_warn_ratelimited("dst addr %pI6 != qp source addr %pI6\n",
+                                           &ipv6_hdr(skb)->daddr, qp_src);
+                       return -EINVAL;
+               }
+
+               if (memcmp(&ipv6_hdr(skb)->saddr, qp_dst, sizeof(*qp_dst))) {
+                       pr_warn_ratelimited("source addr %pI6 != qp dst addr %pI6\n",
+                                           &ipv6_hdr(skb)->saddr, qp_dst);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+/* Validate the headers of @pkt and resolve its destination QP.
+ *
+ * For unicast packets the QP is looked up in the QP pool (qpn 0 and 1
+ * map to the port's SMI and GSI QP indices) and the packet is checked
+ * against it with check_type_state(), check_addr() and check_keys().
+ * Multicast packets only need to carry a GRH; their QP stays NULL.
+ *
+ * On success pkt->qp is set and 0 is returned; the QP obtained from
+ * rxe_pool_get_index() is then left for the caller to release with
+ * rxe_drop_ref().  On failure that QP is dropped here and -EINVAL is
+ * returned.
+ */
+static int hdr_check(struct rxe_pkt_info *pkt)
+{
+       struct rxe_dev *rxe = pkt->rxe;
+       struct rxe_port *port = &rxe->port;
+       struct rxe_qp *qp = NULL;
+       u32 qpn = bth_qpn(pkt);
+       int index;
+       int err;
+
+       if (unlikely(bth_tver(pkt) != BTH_TVER)) {
+               pr_warn_ratelimited("bad tver\n");
+               return -EINVAL;
+       }
+
+       if (qpn == IB_MULTICAST_QPN) {
+               if (unlikely((pkt->mask & RXE_GRH_MASK) == 0)) {
+                       pr_warn_ratelimited("no grh for mcast qpn\n");
+                       return -EINVAL;
+               }
+       } else {
+               if (qpn == 0)
+                       index = port->qp_smi_index;
+               else if (qpn == 1)
+                       index = port->qp_gsi_index;
+               else
+                       index = qpn;
+
+               qp = rxe_pool_get_index(&rxe->qp_pool, index);
+               if (unlikely(!qp)) {
+                       pr_warn_ratelimited("no qp matches qpn 0x%x\n", qpn);
+                       return -EINVAL;
+               }
+
+               err = check_type_state(rxe, pkt, qp);
+               if (unlikely(err))
+                       goto drop_qp;
+
+               err = check_addr(rxe, pkt, qp);
+               if (unlikely(err))
+                       goto drop_qp;
+
+               err = check_keys(rxe, pkt, qpn, qp);
+               if (unlikely(err))
+                       goto drop_qp;
+       }
+
+       pkt->qp = qp;
+       return 0;
+
+drop_qp:
+       if (qp)
+               rxe_drop_ref(qp);
+       return -EINVAL;
+}
+
+/* Hand a validated packet to its QP: request packets go to the responder
+ * queue, everything else to the completer queue.
+ */
+static inline void rxe_rcv_pkt(struct rxe_dev *rxe,
+                              struct rxe_pkt_info *pkt,
+                              struct sk_buff *skb)
+{
+       if (!(pkt->mask & RXE_REQ_MASK)) {
+               rxe_comp_queue_pkt(rxe, pkt->qp, skb);
+               return;
+       }
+
+       rxe_resp_queue_pkt(rxe, pkt->qp, skb);
+}
+
+/* Deliver a multicast packet to every QP attached to its multicast group.
+ *
+ * The destination GID is derived from the IP destination address and
+ * used to look up the group.  Each attached QP that passes
+ * check_type_state() and check_keys() receives the skb; a clone is made
+ * for the next QP unless the current one is the last list entry.  Any
+ * skb that is left over at the end (no group, no accepting QP, or an
+ * unused clone) is freed here.
+ */
+static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
+{
+       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+       struct rxe_mc_grp *mcg;
+       struct sk_buff *skb_copy;
+       struct rxe_mc_elem *mce;
+       struct rxe_qp *qp;
+       union ib_gid dgid;
+       int err;
+
+       /* NOTE(review): dgid is left unset for any other protocol;
+        * presumably callers only pass IPv4/IPv6 skbs - confirm.
+        */
+       if (skb->protocol == htons(ETH_P_IP))
+               ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
+                                      (struct in6_addr *)&dgid);
+       else if (skb->protocol == htons(ETH_P_IPV6))
+               memcpy(&dgid, &ipv6_hdr(skb)->daddr, sizeof(dgid));
+
+       /* lookup mcast group corresponding to mgid, takes a ref */
+       mcg = rxe_pool_get_key(&rxe->mc_grp_pool, &dgid);
+       if (!mcg)
+               goto err1;      /* mcast group not registered */
+
+       spin_lock_bh(&mcg->mcg_lock);
+
+       list_for_each_entry(mce, &mcg->qp_list, qp_list) {
+               qp = mce->qp;
+               pkt = SKB_TO_PKT(skb);
+
+               /* validate qp for incoming packet */
+               err = check_type_state(rxe, pkt, qp);
+               if (err)
+                       continue;
+
+               err = check_keys(rxe, pkt, bth_qpn(pkt), qp);
+               if (err)
+                       continue;
+
+               /* if *not* the last qp in the list
+                * make a copy of the skb to post to the next qp;
+                * mcg_lock is held with bottom halves disabled, so the
+                * allocation must not sleep
+                */
+               skb_copy = (mce->qp_list.next != &mcg->qp_list) ?
+                               skb_clone(skb, GFP_ATOMIC) : NULL;
+
+               pkt->qp = qp;
+               rxe_add_ref(qp);
+               rxe_rcv_pkt(rxe, pkt, skb);
+
+               /* continue with the clone; stop if there is none */
+               skb = skb_copy;
+               if (!skb)
+                       break;
+       }
+
+       spin_unlock_bh(&mcg->mcg_lock);
+
+       rxe_drop_ref(mcg);      /* drop ref from rxe_pool_get_key. */
+
+err1:
+       if (skb)
+               kfree_skb(skb);
+}
+
+/* Look up the packet's destination GID in the device's cached GID table
+ * (RoCE UDP encapsulation type, port 1, bound to rxe->ndev).  IPv4
+ * destinations are converted to v4-mapped form first; every other skb
+ * is treated as IPv6.  Returns the result of
+ * ib_find_cached_gid_by_port(); the caller treats negative as "no match".
+ */
+static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb)
+{
+       union ib_gid mapped;
+       union ib_gid *lookup;
+       u16 index;
+
+       if (skb->protocol != htons(ETH_P_IP)) {
+               lookup = (union ib_gid *)&ipv6_hdr(skb)->daddr;
+       } else {
+               ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
+                                      (struct in6_addr *)&mapped);
+               lookup = &mapped;
+       }
+
+       return ib_find_cached_gid_by_port(&rxe->ib_dev, lookup,
+                                         IB_GID_TYPE_ROCE_UDP_ENCAP,
+                                         1, rxe->ndev, &index);
+}
+
+/* rxe_rcv is called from the interface driver
+ *
+ * Validates an incoming packet (length, destination GID, headers, ICRC)
+ * and passes it on to the multicast or unicast receive path.  Packets
+ * that fail any check are freed, together with the QP reference taken
+ * by hdr_check() if there is one.  Always returns 0.
+ */
+int rxe_rcv(struct sk_buff *skb)
+{
+       int err;
+       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+       struct rxe_dev *rxe = pkt->rxe;
+       __be32 *icrcp;
+       u32 calc_icrc, pack_icrc;
+
+       pkt->offset = 0;
+       /* the drop path looks at pkt->qp, so it has to be initialised
+        * before the first check that can jump there
+        */
+       pkt->qp = NULL;
+
+       if (unlikely(skb->len < pkt->offset + RXE_BTH_BYTES))
+               goto drop;
+
+       if (unlikely(rxe_match_dgid(rxe, skb) < 0)) {
+               pr_warn_ratelimited("failed matching dgid\n");
+               goto drop;
+       }
+
+       pkt->opcode = bth_opcode(pkt);
+       pkt->psn = bth_psn(pkt);
+       pkt->mask |= rxe_opcode[pkt->opcode].mask;
+
+       if (unlikely(skb->len < header_size(pkt)))
+               goto drop;
+
+       err = hdr_check(pkt);
+       if (unlikely(err))
+               goto drop;
+
+       /* Verify ICRC */
+       icrcp = (__be32 *)(pkt->hdr + pkt->paylen - RXE_ICRC_SIZE);
+       pack_icrc = be32_to_cpu(*icrcp);
+
+       calc_icrc = rxe_icrc_hdr(pkt, skb);
+       calc_icrc = crc32_le(calc_icrc, (u8 *)payload_addr(pkt), payload_size(pkt));
+       calc_icrc = cpu_to_be32(~calc_icrc);
+       if (unlikely(calc_icrc != pack_icrc)) {
+               /* Let printk format the address directly: the textual
+                * form of an IPv6 address is far longer than
+                * sizeof(struct in6_addr), so formatting it into a
+                * buffer of that size overruns the stack.
+                */
+               if (skb->protocol == htons(ETH_P_IPV6))
+                       pr_warn_ratelimited("bad ICRC from %pI6\n",
+                                           &ipv6_hdr(skb)->saddr);
+               else if (skb->protocol == htons(ETH_P_IP))
+                       pr_warn_ratelimited("bad ICRC from %pI4\n",
+                                           &ip_hdr(skb)->saddr);
+               else
+                       pr_warn_ratelimited("bad ICRC from unknown\n");
+
+               goto drop;
+       }
+
+       if (unlikely(bth_qpn(pkt) == IB_MULTICAST_QPN))
+               rxe_rcv_mcast_pkt(rxe, skb);
+       else
+               rxe_rcv_pkt(rxe, pkt, skb);
+
+       return 0;
+
+drop:
+       if (pkt->qp)
+               rxe_drop_ref(pkt->qp);
+
+       kfree_skb(skb);
+       return 0;
+}
+EXPORT_SYMBOL(rxe_rcv);
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c

new file mode 100644 (file)

index 0000000..33b2d9d
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -0,0 +1,726 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+                      unsigned opcode);
+
+/* Fast-forward a partially transmitted send/write WQE by @npsn packets.
+ *
+ * For each packet, the opcode state in qp->req.opcode is advanced via
+ * next_opcode() and up to one MTU of payload is consumed from the WQE's
+ * DMA state (by hand for inline data, via advance_dma_data() otherwise).
+ * For writes (@mask has WR_WRITE_MASK) the remote address moves on by
+ * one MTU per packet.
+ */
+static inline void retry_first_write_send(struct rxe_qp *qp,
+                                         struct rxe_send_wqe *wqe,
+                                         unsigned mask, int npsn)
+{
+       int remaining = npsn;
+
+       while (remaining-- > 0) {
+               int to_send;
+
+               if (wqe->dma.resid > qp->mtu)
+                       to_send = qp->mtu;
+               else
+                       to_send = wqe->dma.resid;
+
+               qp->req.opcode = next_opcode(qp, wqe,
+                                            wqe->wr.opcode);
+
+               if (!(wqe->wr.send_flags & IB_SEND_INLINE)) {
+                       advance_dma_data(&wqe->dma, to_send);
+               } else {
+                       wqe->dma.resid -= to_send;
+                       wqe->dma.sge_offset += to_send;
+               }
+
+               if (mask & WR_WRITE_MASK)
+                       wqe->iova += qp->mtu;
+       }
+}
+
+/* Rewind the requester so that everything not yet completed is sent
+ * again.
+ *
+ * The requester's wqe index and PSN are reset to the completer's
+ * position, then every WQE from the consumer index up to the first one
+ * still in the posted state is put back into the posted state with its
+ * remote address and DMA progress restored.  The first WQE that needs
+ * work may already have been partially acknowledged; it is advanced by
+ * the number of packets (npsn) that lie between its first PSN and the
+ * completer's PSN.
+ */
+static void req_retry(struct rxe_qp *qp)
+{
+       struct rxe_send_wqe *wqe;
+       unsigned int wqe_index;
+       unsigned int mask;
+       int npsn;
+       int first = 1;
+
+       /* queue_head() returns NULL for an empty send queue; in that case
+        * the loop below does not run either, so npsn is never used
+        */
+       wqe = queue_head(qp->sq.queue);
+       npsn = wqe ? ((qp->comp.psn - wqe->first_psn) & BTH_PSN_MASK) : 0;
+
+       qp->req.wqe_index       = consumer_index(qp->sq.queue);
+       qp->req.psn             = qp->comp.psn;
+       qp->req.opcode          = -1;
+
+       for (wqe_index = consumer_index(qp->sq.queue);
+               wqe_index != producer_index(qp->sq.queue);
+               wqe_index = next_index(qp->sq.queue, wqe_index)) {
+               wqe = addr_from_index(qp->sq.queue, wqe_index);
+               mask = wr_opcode_mask(wqe->wr.opcode, qp);
+
+               /* everything from here on has not been started yet */
+               if (wqe->state == wqe_state_posted)
+                       break;
+
+               if (wqe->state == wqe_state_done)
+                       continue;
+
+               wqe->iova = (mask & WR_ATOMIC_MASK) ?
+                            wqe->wr.wr.atomic.remote_addr :
+                            (mask & WR_READ_OR_WRITE_MASK) ?
+                            wqe->wr.wr.rdma.remote_addr :
+                            0;
+
+               /* a read that is first in line keeps its DMA progress */
+               if (!first || (mask & WR_READ_MASK) == 0) {
+                       wqe->dma.resid = wqe->dma.length;
+                       wqe->dma.cur_sge = 0;
+                       wqe->dma.sge_offset = 0;
+               }
+
+               if (first) {
+                       first = 0;
+
+                       if (mask & WR_WRITE_OR_SEND_MASK)
+                               retry_first_write_send(qp, wqe, mask, npsn);
+
+                       if (mask & WR_READ_MASK)
+                               wqe->iova += npsn * qp->mtu;
+               }
+
+               wqe->state = wqe_state_posted;
+       }
+}
+
+/* Timer callback: @data carries the rxe_qp pointer; kicks the QP's
+ * requester task.
+ */
+void rnr_nak_timer(unsigned long data)
+{
+       struct rxe_qp *qp;
+
+       qp = (struct rxe_qp *)data;
+
+       pr_debug("rnr nak timer fired\n");
+       rxe_run_task(&qp->req.task, 1);
+}
+
+/* Return the next send WQE the requester should work on, or NULL if
+ * there is nothing to do right now.
+ *
+ * While the QP is draining this also detects the moment the send queue
+ * has drained, moves the requester to QP_STATE_DRAINED and raises
+ * IB_EVENT_SQ_DRAINED through the QP's event handler, if one is set.
+ *
+ * NULL is returned when the requester has caught up with the producer,
+ * when the QP is draining/drained and the WQE is not already being
+ * processed, or when the WQE is fenced and earlier WQEs are still
+ * outstanding (req.wait_fence is set in that case).  Otherwise the
+ * WQE's mask is refreshed from its opcode and the WQE is returned.
+ */
+static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
+{
+       struct rxe_send_wqe *wqe = queue_head(qp->sq.queue);
+       unsigned long flags;
+
+       if (unlikely(qp->req.state == QP_STATE_DRAIN)) {
+               /* check to see if we are drained;
+                * state_lock used by requester and completer
+                */
+               spin_lock_irqsave(&qp->state_lock, flags);
+               /* single-pass block: 'break' is used as an early exit,
+                * and every exit path releases state_lock itself
+                */
+               do {
+                       if (qp->req.state != QP_STATE_DRAIN) {
+                               /* comp just finished */
+                               spin_unlock_irqrestore(&qp->state_lock,
+                                                      flags);
+                               break;
+                       }
+
+                       if (wqe && ((qp->req.wqe_index !=
+                               consumer_index(qp->sq.queue)) ||
+                               (wqe->state != wqe_state_posted))) {
+                               /* comp not done yet */
+                               spin_unlock_irqrestore(&qp->state_lock,
+                                                      flags);
+                               break;
+                       }
+
+                       qp->req.state = QP_STATE_DRAINED;
+                       spin_unlock_irqrestore(&qp->state_lock, flags);
+
+                       /* the event handler runs without state_lock held */
+                       if (qp->ibqp.event_handler) {
+                               struct ib_event ev;
+
+                               ev.device = qp->ibqp.device;
+                               ev.element.qp = &qp->ibqp;
+                               ev.event = IB_EVENT_SQ_DRAINED;
+                               qp->ibqp.event_handler(&ev,
+                                       qp->ibqp.qp_context);
+                       }
+               } while (0);
+       }
+
+       /* requester has caught up with the producer: nothing to send */
+       if (qp->req.wqe_index == producer_index(qp->sq.queue))
+               return NULL;
+
+       wqe = addr_from_index(qp->sq.queue, qp->req.wqe_index);
+
+       /* while draining/drained only a WQE already in progress continues */
+       if (unlikely((qp->req.state == QP_STATE_DRAIN ||
+                     qp->req.state == QP_STATE_DRAINED) &&
+                    (wqe->state != wqe_state_processing)))
+               return NULL;
+
+       /* a fenced WQE waits until it is at the head of the send queue */
+       if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
+                    (qp->req.wqe_index != consumer_index(qp->sq.queue)))) {
+               qp->req.wait_fence = 1;
+               return NULL;
+       }
+
+       wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
+       return wqe;
+}
+
+/* Pick the RC wire opcode for the next packet of a work request.
+ *
+ * @opcode is the work request opcode and @fits says whether the
+ * remaining payload fits into one packet.  Whether a multi-packet
+ * message is already in flight is derived from the previously sent
+ * opcode in qp->req.opcode: after FIRST/MIDDLE the next packet is
+ * MIDDLE or LAST, otherwise it is FIRST or ONLY.  Reads and atomics are
+ * single request packets; REG_MR and LOCAL_INV are returned unchanged.
+ *
+ * Returns the opcode, or -EINVAL for an unsupported work request.
+ */
+static int next_opcode_rc(struct rxe_qp *qp, unsigned opcode, int fits)
+{
+       int in_write = (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST ||
+                       qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE);
+       int in_send = (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+                      qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE);
+
+       switch (opcode) {
+       case IB_WR_RDMA_WRITE:
+               if (in_write)
+                       return fits ? IB_OPCODE_RC_RDMA_WRITE_LAST :
+                                     IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+               return fits ? IB_OPCODE_RC_RDMA_WRITE_ONLY :
+                             IB_OPCODE_RC_RDMA_WRITE_FIRST;
+
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               if (in_write)
+                       return fits ?
+                               IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+                               IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+               return fits ? IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE :
+                             IB_OPCODE_RC_RDMA_WRITE_FIRST;
+
+       case IB_WR_SEND:
+               if (in_send)
+                       return fits ? IB_OPCODE_RC_SEND_LAST :
+                                     IB_OPCODE_RC_SEND_MIDDLE;
+               return fits ? IB_OPCODE_RC_SEND_ONLY :
+                             IB_OPCODE_RC_SEND_FIRST;
+
+       case IB_WR_SEND_WITH_IMM:
+               if (in_send)
+                       return fits ? IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE :
+                                     IB_OPCODE_RC_SEND_MIDDLE;
+               return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE :
+                             IB_OPCODE_RC_SEND_FIRST;
+
+       case IB_WR_SEND_WITH_INV:
+               if (in_send)
+                       return fits ? IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE :
+                                     IB_OPCODE_RC_SEND_MIDDLE;
+               return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE :
+                             IB_OPCODE_RC_SEND_FIRST;
+
+       case IB_WR_RDMA_READ:
+               return IB_OPCODE_RC_RDMA_READ_REQUEST;
+
+       case IB_WR_ATOMIC_CMP_AND_SWP:
+               return IB_OPCODE_RC_COMPARE_SWAP;
+
+       case IB_WR_ATOMIC_FETCH_AND_ADD:
+               return IB_OPCODE_RC_FETCH_ADD;
+
+       case IB_WR_REG_MR:
+       case IB_WR_LOCAL_INV:
+               return opcode;
+       }
+
+       return -EINVAL;
+}
+
+/* Pick the UC wire opcode for the next packet of a work request.
+ *
+ * Same scheme as the RC variant, restricted to sends and RDMA writes
+ * (with or without immediate): after a FIRST/MIDDLE packet the next one
+ * is MIDDLE or LAST, otherwise FIRST or ONLY, depending on @fits.
+ *
+ * Returns the opcode, or -EINVAL for an unsupported work request.
+ */
+static int next_opcode_uc(struct rxe_qp *qp, unsigned opcode, int fits)
+{
+       int in_write = (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST ||
+                       qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE);
+       int in_send = (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST ||
+                      qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE);
+
+       switch (opcode) {
+       case IB_WR_RDMA_WRITE:
+               if (in_write)
+                       return fits ? IB_OPCODE_UC_RDMA_WRITE_LAST :
+                                     IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+               return fits ? IB_OPCODE_UC_RDMA_WRITE_ONLY :
+                             IB_OPCODE_UC_RDMA_WRITE_FIRST;
+
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               if (in_write)
+                       return fits ?
+                               IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+                               IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+               return fits ? IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE :
+                             IB_OPCODE_UC_RDMA_WRITE_FIRST;
+
+       case IB_WR_SEND:
+               if (in_send)
+                       return fits ? IB_OPCODE_UC_SEND_LAST :
+                                     IB_OPCODE_UC_SEND_MIDDLE;
+               return fits ? IB_OPCODE_UC_SEND_ONLY :
+                             IB_OPCODE_UC_SEND_FIRST;
+
+       case IB_WR_SEND_WITH_IMM:
+               if (in_send)
+                       return fits ? IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE :
+                                     IB_OPCODE_UC_SEND_MIDDLE;
+               return fits ? IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE :
+                             IB_OPCODE_UC_SEND_FIRST;
+       }
+
+       return -EINVAL;
+}
+
+/* Map a work request opcode to the wire opcode of the next packet,
+ * dispatching on the QP type.
+ *
+ * Returns the IB_OPCODE_* value, or -EINVAL when the QP type / work
+ * request combination is not supported.
+ */
+static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+                      unsigned opcode)
+{
+       /* true when what is left of the message needs only one more packet */
+       const int fits = (wqe->dma.resid <= qp->mtu);
+
+       switch (qp_type(qp)) {
+       case IB_QPT_RC:
+               return next_opcode_rc(qp, opcode, fits);
+
+       case IB_QPT_UC:
+               return next_opcode_uc(qp, opcode, fits);
+
+       case IB_QPT_SMI:
+       case IB_QPT_UD:
+       case IB_QPT_GSI:
+               /* datagram QPs only ever emit single packet sends */
+               if (opcode == IB_WR_SEND)
+                       return IB_OPCODE_UD_SEND_ONLY;
+               if (opcode == IB_WR_SEND_WITH_IMM)
+                       return IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+               break;
+
+       default:
+               break;
+       }
+
+       return -EINVAL;
+}
+
+/* Reserve one of the QP's rd_atomic credits for a read/atomic wqe.
+ *
+ * Returns 0 when the wqe already holds a credit or one was taken now,
+ * -EAGAIN when the counter was exhausted (the decrement is undone and
+ * qp->req.need_rd_atomic stays set).
+ */
+static inline int check_init_depth(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+       if (wqe->has_rd_atomic)
+               return 0;
+
+       qp->req.need_rd_atomic = 1;
+
+       if (atomic_dec_return(&qp->req.rd_atomic) < 0) {
+               /* nothing left: give the credit back and ask for a retry */
+               atomic_inc(&qp->req.rd_atomic);
+               return -EAGAIN;
+       }
+
+       qp->req.need_rd_atomic = 0;
+       wqe->has_rd_atomic = 1;
+       return 0;
+}
+
+/* Return the MTU that limits the payload of the next request packet.
+ *
+ * RC and UC QPs use the MTU stored in the QP; every other QP type uses
+ * the port's mtu_cap.  @wqe is kept for interface compatibility and is
+ * not consulted.
+ */
+static inline int get_mtu(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+
+       if ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC))
+               return qp->mtu;
+
+       return rxe->port.mtu_cap;
+}
+
+/* Build the headers of one request packet.
+ *
+ * Fills in @pkt, obtains an skb from the interface layer's init_packet()
+ * hook, then writes the BTH and whichever extension headers the opcode's
+ * mask calls for (RETH, IMMDT, IETH, ATMETH, DETH).  The payload itself is
+ * not copied here.
+ *
+ * @qp:      queue pair sending the packet
+ * @wqe:     send work queue element being processed
+ * @opcode:  wire opcode, used to index rxe_opcode[]
+ * @payload: number of payload bytes this packet will carry
+ * @pkt:     packet info to initialise
+ *
+ * Returns the skb, or NULL if init_packet() did not provide one.
+ */
+static struct sk_buff *init_req_packet(struct rxe_qp *qp,
+                                      struct rxe_send_wqe *wqe,
+                                      int opcode, int payload,
+                                      struct rxe_pkt_info *pkt)
+{
+       struct rxe_dev          *rxe = to_rdev(qp->ibqp.device);
+       struct rxe_port         *port = &rxe->port;
+       struct sk_buff          *skb;
+       struct rxe_send_wr      *ibwr = &wqe->wr;
+       struct rxe_av           *av;
+       /* bytes needed to round the payload up to a multiple of four */
+       int                     pad = (-payload) & 0x3;
+       int                     paylen;
+       int                     solicited;
+       u16                     pkey;
+       u32                     qp_num;
+       int                     ack_req;
+
+       /* length from start of bth to end of icrc */
+       paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE;
+
+       /* pkt->hdr, rxe, port_num and mask are initialized in ifc
+        * layer
+        */
+       pkt->opcode     = opcode;
+       pkt->qp         = qp;
+       pkt->psn        = qp->req.psn;
+       pkt->mask       = rxe_opcode[opcode].mask;
+       pkt->paylen     = paylen;
+       pkt->offset     = 0;
+       pkt->wqe        = wqe;
+
+       /* init skb */
+       av = rxe_get_av(pkt);
+       skb = rxe->ifc_ops->init_packet(rxe, av, paylen, pkt);
+       if (unlikely(!skb))
+               return NULL;
+
+       /* init bth */
+       /* solicited only on the final packet of a send, or of a write
+        * that carries immediate data, and only if the WR asked for it
+        */
+       solicited = (ibwr->send_flags & IB_SEND_SOLICITED) &&
+                       (pkt->mask & RXE_END_MASK) &&
+                       ((pkt->mask & (RXE_SEND_MASK)) ||
+                       (pkt->mask & (RXE_WRITE_MASK | RXE_IMMDT_MASK)) ==
+                       (RXE_WRITE_MASK | RXE_IMMDT_MASK));
+
+       /* GSI QPs take the pkey index from the work request, all others
+        * from the QP attributes
+        */
+       pkey = (qp_type(qp) == IB_QPT_GSI) ?
+                port->pkey_tbl[ibwr->wr.ud.pkey_index] :
+                port->pkey_tbl[qp->attr.pkey_index];
+
+       /* packets with a DETH are addressed per work request */
+       qp_num = (pkt->mask & RXE_DETH_MASK) ? ibwr->wr.ud.remote_qpn :
+                                        qp->attr.dest_qp_num;
+
+       /* ask for an ack on the last packet of a message, or once more
+        * than RXE_MAX_PKT_PER_ACK packets went out without asking; the
+        * counter is only incremented when the first test is false
+        */
+       ack_req = ((pkt->mask & RXE_END_MASK) ||
+               (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK));
+       if (ack_req)
+               qp->req.noack_pkts = 0;
+
+       bth_init(pkt, pkt->opcode, solicited, 0, pad, pkey, qp_num,
+                ack_req, pkt->psn);
+
+       /* init optional headers */
+       if (pkt->mask & RXE_RETH_MASK) {
+               reth_set_rkey(pkt, ibwr->wr.rdma.rkey);
+               reth_set_va(pkt, wqe->iova);
+               reth_set_len(pkt, wqe->dma.length);
+       }
+
+       if (pkt->mask & RXE_IMMDT_MASK)
+               immdt_set_imm(pkt, ibwr->ex.imm_data);
+
+       if (pkt->mask & RXE_IETH_MASK)
+               ieth_set_rkey(pkt, ibwr->ex.invalidate_rkey);
+
+       if (pkt->mask & RXE_ATMETH_MASK) {
+               atmeth_set_va(pkt, wqe->iova);
+               /* compare & swap carries two operands; any other atomic
+                * carries compare_add alone in the swap_add field
+                */
+               if (opcode == IB_OPCODE_RC_COMPARE_SWAP ||
+                   opcode == IB_OPCODE_RD_COMPARE_SWAP) {
+                       atmeth_set_swap_add(pkt, ibwr->wr.atomic.swap);
+                       atmeth_set_comp(pkt, ibwr->wr.atomic.compare_add);
+               } else {
+                       atmeth_set_swap_add(pkt, ibwr->wr.atomic.compare_add);
+               }
+               atmeth_set_rkey(pkt, ibwr->wr.atomic.rkey);
+       }
+
+       if (pkt->mask & RXE_DETH_MASK) {
+               /* QP number 1 always uses GSI_QKEY instead of the WR's qkey */
+               if (qp->ibqp.qp_num == 1)
+                       deth_set_qkey(pkt, GSI_QKEY);
+               else
+                       deth_set_qkey(pkt, ibwr->wr.ud.remote_qkey);
+               deth_set_sqp(pkt, qp->ibqp.qp_num);
+       }
+
+       return skb;
+}
+
+/* Copy the payload into a request packet and append the ICRC.
+ *
+ * The interface layer's prepare() hook is called first and seeds @crc.
+ * For send/write packets @paylen bytes are then taken either from the
+ * wqe's inline data or, via copy_data(), from the wqe's dma description,
+ * and folded into the crc.  Finally the inverted crc is stored right
+ * after the payload and its pad bytes.
+ *
+ * Returns 0 on success or the error from prepare()/copy_data().
+ */
+static int fill_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+                      struct rxe_pkt_info *pkt, struct sk_buff *skb,
+                      int paylen)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       u32 crc = 0;
+       u32 *p;
+       int err;
+
+       err = rxe->ifc_ops->prepare(rxe, pkt, skb, &crc);
+       if (err)
+               return err;
+
+       if (pkt->mask & RXE_WRITE_OR_SEND) {
+               if (wqe->wr.send_flags & IB_SEND_INLINE) {
+                       u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset];
+
+                       crc = crc32_le(crc, tmp, paylen);
+
+                       memcpy(payload_addr(pkt), tmp, paylen);
+
+                       /* consume the inline bytes just sent */
+                       wqe->dma.resid -= paylen;
+                       wqe->dma.sge_offset += paylen;
+               } else {
+                       err = copy_data(rxe, qp->pd, 0, &wqe->dma,
+                                       payload_addr(pkt), paylen,
+                                       from_mem_obj,
+                                       &crc);
+                       if (err)
+                               return err;
+               }
+       }
+
+       /* Nothing above writes the pad bytes between the payload and the
+        * icrc; clear them so that stale buffer contents are not put on
+        * the wire.  The crc coverage is deliberately left unchanged so
+        * it keeps matching the receive side, which is not visible here.
+        */
+       if (bth_pad(pkt))
+               memset(payload_addr(pkt) + paylen, 0, bth_pad(pkt));
+
+       /* the icrc follows the payload and its pad bytes */
+       p = payload_addr(pkt) + paylen + bth_pad(pkt);
+
+       *p = ~crc;
+
+       return 0;
+}
+
+/* Move the wqe to the state that corresponds to the packet about to be
+ * sent and hand the previous state back through @prev_state so the caller
+ * can restore it.
+ *
+ * A packet that does not end the message puts the wqe in
+ * wqe_state_processing.  The last packet puts an RC wqe in
+ * wqe_state_pending and leaves the state of other QP types untouched.
+ */
+static void update_wqe_state(struct rxe_qp *qp,
+                            struct rxe_send_wqe *wqe,
+                            struct rxe_pkt_info *pkt,
+                            enum wqe_state *prev_state)
+{
+       const enum wqe_state saved = wqe->state;
+
+       if (!(pkt->mask & RXE_END_MASK))
+               wqe->state = wqe_state_processing;
+       else if (qp_type(qp) == IB_QPT_RC)
+               wqe->state = wqe_state_pending;
+
+       *prev_state = saved;
+}
+
+/* Advance the requester's bookkeeping once a packet has been sent:
+ * psn range of the wqe, next psn, last opcode, send queue index and the
+ * retransmit timer.
+ */
+static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+                        struct rxe_pkt_info *pkt, int payload)
+{
+       /* packets this wqe still needs, counting the one just sent */
+       int pkts_left = (wqe->dma.resid + payload + qp->mtu - 1) / qp->mtu;
+
+       /* a zero length message still takes one packet */
+       if (!pkts_left)
+               pkts_left = 1;
+
+       /* the first packet of a message fixes the wqe's psn range */
+       if (pkt->mask & RXE_START_MASK) {
+               wqe->first_psn = qp->req.psn;
+               wqe->last_psn = (qp->req.psn + pkts_left - 1) & BTH_PSN_MASK;
+       }
+
+       /* a read request advances the psn by pkts_left at once, anything
+        * else by one per packet
+        */
+       if (!(pkt->mask & RXE_READ_MASK))
+               qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK;
+       else
+               qp->req.psn = (wqe->first_psn + pkts_left) & BTH_PSN_MASK;
+
+       qp->req.opcode = pkt->opcode;
+
+       /* message complete: step to the next send queue entry */
+       if (pkt->mask & RXE_END_MASK)
+               qp->req.wqe_index = next_index(qp->sq.queue, qp->req.wqe_index);
+
+       qp->need_req_skb = 0;
+
+       /* start the retransmit timer unless it is already running */
+       if (qp->qp_timeout_jiffies && !timer_pending(&qp->retrans_timer))
+               mod_timer(&qp->retrans_timer,
+                         jiffies + qp->qp_timeout_jiffies);
+}
+
+/* Requester side of a QP: turn send work queue elements into request
+ * packets and transmit them.
+ *
+ * @arg: the struct rxe_qp to service, passed as void *
+ *
+ * Each pass picks the next wqe, handles local operations (REG_MR,
+ * LOCAL_INV) inline, and otherwise builds, fills and transmits one packet
+ * before looping again.
+ *
+ * Returns 0 after the err/complete path has run and -EAGAIN from every
+ * "exit" path (QP invalid/in error/reset, no wqe, flow control limits,
+ * unsupported opcode, no rd_atomic credit, transmit asked to retry).
+ */
+int rxe_requester(void *arg)
+{
+       struct rxe_qp *qp = (struct rxe_qp *)arg;
+       struct rxe_pkt_info pkt;
+       struct sk_buff *skb;
+       struct rxe_send_wqe *wqe;
+       unsigned mask;
+       int payload;
+       int mtu;
+       int opcode;
+       int ret;
+       enum wqe_state prev_state;
+
+next_wqe:
+       if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR))
+               goto exit;
+
+       /* a reset QP restarts from the consumer index with clean state */
+       if (unlikely(qp->req.state == QP_STATE_RESET)) {
+               qp->req.wqe_index = consumer_index(qp->sq.queue);
+               qp->req.opcode = -1;
+               qp->req.need_rd_atomic = 0;
+               qp->req.wait_psn = 0;
+               qp->req.need_retry = 0;
+               goto exit;
+       }
+
+       if (unlikely(qp->req.need_retry)) {
+               req_retry(qp);
+               qp->req.need_retry = 0;
+       }
+
+       wqe = req_next_wqe(qp);
+       if (unlikely(!wqe))
+               goto exit;
+
+       /* local operations complete here without emitting a packet */
+       if (wqe->mask & WR_REG_MASK) {
+               if (wqe->wr.opcode == IB_WR_LOCAL_INV) {
+                       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+                       struct rxe_mem *rmr;
+
+                       /* the pool index is the rkey without its low 8 bits */
+                       rmr = rxe_pool_get_index(&rxe->mr_pool,
+                                                wqe->wr.ex.invalidate_rkey >> 8);
+                       if (!rmr) {
+                               pr_err("No mr for key %#x\n", wqe->wr.ex.invalidate_rkey);
+                               wqe->state = wqe_state_error;
+                               wqe->status = IB_WC_MW_BIND_ERR;
+                               goto exit;
+                       }
+                       /* NOTE(review): if rxe_pool_get_index() returns a
+                        * counted reference it is never released here -
+                        * confirm and drop it if so.
+                        */
+                       rmr->state = RXE_MEM_STATE_FREE;
+                       wqe->state = wqe_state_done;
+                       wqe->status = IB_WC_SUCCESS;
+               } else if (wqe->wr.opcode == IB_WR_REG_MR) {
+                       struct rxe_mem *rmr = to_rmr(wqe->wr.wr.reg.mr);
+
+                       rmr->state = RXE_MEM_STATE_VALID;
+                       rmr->access = wqe->wr.wr.reg.access;
+                       rmr->lkey = wqe->wr.wr.reg.key;
+                       rmr->rkey = wqe->wr.wr.reg.key;
+                       wqe->state = wqe_state_done;
+                       wqe->status = IB_WC_SUCCESS;
+               } else {
+                       goto exit;
+               }
+               qp->req.wqe_index = next_index(qp->sq.queue,
+                                               qp->req.wqe_index);
+               goto next_wqe;
+       }
+
+       /* RC: stop once too many psns are outstanding without completion */
+       if (unlikely(qp_type(qp) == IB_QPT_RC &&
+                    qp->req.psn > (qp->comp.psn + RXE_MAX_UNACKED_PSNS))) {
+               qp->req.wait_psn = 1;
+               goto exit;
+       }
+
+       /* Limit the number of inflight SKBs per QP */
+       if (unlikely(atomic_read(&qp->skb_out) >
+                    RXE_INFLIGHT_SKBS_PER_QP_HIGH)) {
+               qp->need_req_skb = 1;
+               goto exit;
+       }
+
+       opcode = next_opcode(qp, wqe, wqe->wr.opcode);
+       if (unlikely(opcode < 0)) {
+               wqe->status = IB_WC_LOC_QP_OP_ERR;
+               goto exit;
+       }
+
+       mask = rxe_opcode[opcode].mask;
+       if (unlikely(mask & RXE_READ_OR_ATOMIC)) {
+               if (check_init_depth(qp, wqe))
+                       goto exit;
+       }
+
+       mtu = get_mtu(qp, wqe);
+       payload = (mask & RXE_WRITE_OR_SEND) ? wqe->dma.resid : 0;
+       if (payload > mtu) {
+               if (qp_type(qp) == IB_QPT_UD) {
+                       /* C10-93.1.1: If the total sum of all the buffer lengths specified for a
+                        * UD message exceeds the MTU of the port as returned by QueryHCA, the CI
+                        * shall not emit any packets for this message. Further, the CI shall not
+                        * generate an error due to this condition.
+                        */
+
+                       /* fake a successful UD send */
+                       wqe->first_psn = qp->req.psn;
+                       wqe->last_psn = qp->req.psn;
+                       qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK;
+                       qp->req.opcode = IB_OPCODE_UD_SEND_ONLY;
+                       qp->req.wqe_index = next_index(qp->sq.queue,
+                                                      qp->req.wqe_index);
+                       wqe->state = wqe_state_done;
+                       wqe->status = IB_WC_SUCCESS;
+                       goto complete;
+               }
+               /* segment: this packet carries one mtu worth of data */
+               payload = mtu;
+       }
+
+       skb = init_req_packet(qp, wqe, opcode, payload, &pkt);
+       if (unlikely(!skb)) {
+               pr_err("Failed allocating skb\n");
+               goto err;
+       }
+
+       if (fill_packet(qp, wqe, &pkt, skb, payload)) {
+               pr_debug("Error during fill packet\n");
+               goto err;
+       }
+
+       update_wqe_state(qp, wqe, &pkt, &prev_state);
+       ret = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, &pkt, skb);
+       if (ret) {
+               qp->need_req_skb = 1;
+
+               /* the packet did not go out: undo update_wqe_state() */
+               wqe->state = prev_state;
+
+               if (ret == -EAGAIN) {
+                       /* this path bypasses the err label, free here */
+                       kfree_skb(skb);
+                       rxe_run_task(&qp->req.task, 1);
+                       goto exit;
+               }
+
+               /* the err label frees the skb; freeing it here as well
+                * would release it twice
+                */
+               goto err;
+       }
+
+       update_state(qp, wqe, &pkt, payload);
+
+       goto next_wqe;
+
+err:
+       kfree_skb(skb);
+       wqe->status = IB_WC_LOC_PROT_ERR;
+       wqe->state = wqe_state_error;
+
+complete:
+       /* non-RC QPs are completed by calling the completer directly,
+        * repeatedly for as long as it returns 0
+        */
+       if (qp_type(qp) != IB_QPT_RC) {
+               while (rxe_completer(qp) == 0)
+                       ;
+       }
+
+       return 0;
+
+exit:
+       return -EAGAIN;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c

new file mode 100644 (file)

index 0000000..ebb03b4
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -0,0 +1,1380 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+/* Processing steps and error outcomes of the responder.  The helper
+ * functions in this file return one of these values to name the step
+ * that should run next.  Keep the order in sync with resp_state_name[].
+ */
+enum resp_states {
+       RESPST_NONE,
+       RESPST_GET_REQ,
+       RESPST_CHK_PSN,
+       RESPST_CHK_OP_SEQ,
+       RESPST_CHK_OP_VALID,
+       RESPST_CHK_RESOURCE,
+       RESPST_CHK_LENGTH,
+       RESPST_CHK_RKEY,
+       RESPST_EXECUTE,
+       RESPST_READ_REPLY,
+       RESPST_COMPLETE,
+       RESPST_ACKNOWLEDGE,
+       RESPST_CLEANUP,
+       RESPST_DUPLICATE_REQUEST,
+       RESPST_ERR_MALFORMED_WQE,
+       RESPST_ERR_UNSUPPORTED_OPCODE,
+       RESPST_ERR_MISALIGNED_ATOMIC,
+       RESPST_ERR_PSN_OUT_OF_SEQ,
+       RESPST_ERR_MISSING_OPCODE_FIRST,
+       RESPST_ERR_MISSING_OPCODE_LAST_C,
+       RESPST_ERR_MISSING_OPCODE_LAST_D1E,
+       RESPST_ERR_TOO_MANY_RDMA_ATM_REQ,
+       RESPST_ERR_RNR,
+       RESPST_ERR_RKEY_VIOLATION,
+       RESPST_ERR_LENGTH,
+       RESPST_ERR_CQ_OVERFLOW,
+       RESPST_ERROR,
+       RESPST_RESET,
+       RESPST_DONE,
+       RESPST_EXIT,
+};
+
+/* Printable name of each enum resp_states value, indexed by the enum
+ * (the RESPST_ prefix is dropped).  Presumably used for debug output;
+ * the user is not visible in this part of the file.
+ */
+static char *resp_state_name[] = {
+       [RESPST_NONE]                           = "NONE",
+       [RESPST_GET_REQ]                        = "GET_REQ",
+       [RESPST_CHK_PSN]                        = "CHK_PSN",
+       [RESPST_CHK_OP_SEQ]                     = "CHK_OP_SEQ",
+       [RESPST_CHK_OP_VALID]                   = "CHK_OP_VALID",
+       [RESPST_CHK_RESOURCE]                   = "CHK_RESOURCE",
+       [RESPST_CHK_LENGTH]                     = "CHK_LENGTH",
+       [RESPST_CHK_RKEY]                       = "CHK_RKEY",
+       [RESPST_EXECUTE]                        = "EXECUTE",
+       [RESPST_READ_REPLY]                     = "READ_REPLY",
+       [RESPST_COMPLETE]                       = "COMPLETE",
+       [RESPST_ACKNOWLEDGE]                    = "ACKNOWLEDGE",
+       [RESPST_CLEANUP]                        = "CLEANUP",
+       [RESPST_DUPLICATE_REQUEST]              = "DUPLICATE_REQUEST",
+       [RESPST_ERR_MALFORMED_WQE]              = "ERR_MALFORMED_WQE",
+       [RESPST_ERR_UNSUPPORTED_OPCODE]         = "ERR_UNSUPPORTED_OPCODE",
+       [RESPST_ERR_MISALIGNED_ATOMIC]          = "ERR_MISALIGNED_ATOMIC",
+       [RESPST_ERR_PSN_OUT_OF_SEQ]             = "ERR_PSN_OUT_OF_SEQ",
+       [RESPST_ERR_MISSING_OPCODE_FIRST]       = "ERR_MISSING_OPCODE_FIRST",
+       [RESPST_ERR_MISSING_OPCODE_LAST_C]      = "ERR_MISSING_OPCODE_LAST_C",
+       [RESPST_ERR_MISSING_OPCODE_LAST_D1E]    = "ERR_MISSING_OPCODE_LAST_D1E",
+       [RESPST_ERR_TOO_MANY_RDMA_ATM_REQ]      = "ERR_TOO_MANY_RDMA_ATM_REQ",
+       [RESPST_ERR_RNR]                        = "ERR_RNR",
+       [RESPST_ERR_RKEY_VIOLATION]             = "ERR_RKEY_VIOLATION",
+       [RESPST_ERR_LENGTH]                     = "ERR_LENGTH",
+       [RESPST_ERR_CQ_OVERFLOW]                = "ERR_CQ_OVERFLOW",
+       [RESPST_ERROR]                          = "ERROR",
+       [RESPST_RESET]                          = "RESET",
+       [RESPST_DONE]                           = "DONE",
+       [RESPST_EXIT]                           = "EXIT",
+};
+
+/* Entry point used by rxe_recv: append a request packet to the QP's
+ * input queue and kick the responder task.  The task is told to
+ * schedule when the packet is an RC RDMA read request or when more than
+ * one packet is now queued.
+ */
+void rxe_resp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp,
+                       struct sk_buff *skb)
+{
+       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+       int must_sched;
+
+       skb_queue_tail(&qp->req_pkts, skb);
+
+       if (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST)
+               must_sched = 1;
+       else
+               must_sched = (skb_queue_len(&qp->req_pkts) > 1);
+
+       rxe_run_task(&qp->resp.task, must_sched);
+}
+
+/* Fetch the next request packet for the responder.
+ *
+ * In the error state queued packets are discarded one per call
+ * (returning RESPST_GET_REQ so the caller comes back) and, once the
+ * queue is empty, RESPST_CHK_RESOURCE is returned to drain the receive
+ * queue.  Otherwise the head packet is peeked, not dequeued, and stored
+ * in *pkt_p.
+ */
+static inline enum resp_states get_req(struct rxe_qp *qp,
+                                      struct rxe_pkt_info **pkt_p)
+{
+       struct sk_buff *skb;
+
+       if (qp->resp.state == QP_STATE_ERROR) {
+               skb = skb_dequeue(&qp->req_pkts);
+               if (!skb) {
+                       /* go drain recv wr queue */
+                       return RESPST_CHK_RESOURCE;
+               }
+
+               /* drain request packet queue */
+               rxe_drop_ref(qp);
+               kfree_skb(skb);
+               return RESPST_GET_REQ;
+       }
+
+       skb = skb_peek(&qp->req_pkts);
+       if (!skb)
+               return RESPST_EXIT;
+
+       *pkt_p = SKB_TO_PKT(skb);
+
+       /* with qp->resp.res set, continue with the read reply step */
+       if (qp->resp.res)
+               return RESPST_READ_REPLY;
+
+       return RESPST_CHK_PSN;
+}
+
+/* Compare the packet's psn with the psn the responder expects.
+ *
+ * RC: a psn ahead of the expected one is answered with
+ *     RESPST_ERR_PSN_OUT_OF_SEQ once and then silently cleaned up until
+ *     the expected psn arrives; a psn behind it is a duplicate request.
+ * UC: after a psn mismatch (or while a message is being dropped)
+ *     packets are discarded until one that starts a new message.
+ * Other QP types are not checked.
+ */
+static enum resp_states check_psn(struct rxe_qp *qp,
+                                 struct rxe_pkt_info *pkt)
+{
+       const int diff = psn_compare(pkt->psn, qp->resp.psn);
+
+       switch (qp_type(qp)) {
+       case IB_QPT_RC:
+               if (diff < 0)
+                       return RESPST_DUPLICATE_REQUEST;
+
+               if (diff > 0) {
+                       /* only the first out of sequence packet gets a nak */
+                       if (qp->resp.sent_psn_nak)
+                               return RESPST_CLEANUP;
+
+                       qp->resp.sent_psn_nak = 1;
+                       return RESPST_ERR_PSN_OUT_OF_SEQ;
+               }
+
+               /* back in sequence: allow a nak for the next gap */
+               if (qp->resp.sent_psn_nak)
+                       qp->resp.sent_psn_nak = 0;
+
+               break;
+
+       case IB_QPT_UC:
+               if (!qp->resp.drop_msg && diff == 0)
+                       break;
+
+               /* a packet that starts a message ends the dropping */
+               if (pkt->mask & RXE_START_MASK) {
+                       qp->resp.drop_msg = 0;
+                       return RESPST_CHK_OP_SEQ;
+               }
+
+               qp->resp.drop_msg = 1;
+               return RESPST_CLEANUP;
+
+       default:
+               break;
+       }
+
+       return RESPST_CHK_OP_SEQ;
+}
+
+/* Check that the packet's opcode may follow the previous opcode
+ * (qp->resp.opcode) of the same QP.
+ *
+ * After a FIRST/MIDDLE packet only a MIDDLE or LAST packet of the same
+ * kind (send or RDMA write) is accepted; outside of a message a MIDDLE or
+ * LAST packet is rejected.  RC reports violations as
+ * RESPST_ERR_MISSING_OPCODE_LAST_C / RESPST_ERR_MISSING_OPCODE_FIRST.
+ * UC reports RESPST_ERR_MISSING_OPCODE_LAST_D1E in the first case and
+ * drops the message (RESPST_CLEANUP) in the second.  Other QP types are
+ * not checked.
+ */
+static enum resp_states check_op_seq(struct rxe_qp *qp,
+                                    struct rxe_pkt_info *pkt)
+{
+       switch (qp_type(qp)) {
+       case IB_QPT_RC:
+               switch (qp->resp.opcode) {
+               /* a send is in progress */
+               case IB_OPCODE_RC_SEND_FIRST:
+               case IB_OPCODE_RC_SEND_MIDDLE:
+                       switch (pkt->opcode) {
+                       case IB_OPCODE_RC_SEND_MIDDLE:
+                       case IB_OPCODE_RC_SEND_LAST:
+                       case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE:
+                       case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE:
+                               return RESPST_CHK_OP_VALID;
+                       default:
+                               return RESPST_ERR_MISSING_OPCODE_LAST_C;
+                       }
+
+               /* an RDMA write is in progress */
+               case IB_OPCODE_RC_RDMA_WRITE_FIRST:
+               case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+                       switch (pkt->opcode) {
+                       case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+                       case IB_OPCODE_RC_RDMA_WRITE_LAST:
+                       case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+                               return RESPST_CHK_OP_VALID;
+                       default:
+                               return RESPST_ERR_MISSING_OPCODE_LAST_C;
+                       }
+
+               /* no message in progress: MIDDLE/LAST lack their FIRST */
+               default:
+                       switch (pkt->opcode) {
+                       case IB_OPCODE_RC_SEND_MIDDLE:
+                       case IB_OPCODE_RC_SEND_LAST:
+                       case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE:
+                       case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE:
+                       case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+                       case IB_OPCODE_RC_RDMA_WRITE_LAST:
+                       case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+                               return RESPST_ERR_MISSING_OPCODE_FIRST;
+                       default:
+                               return RESPST_CHK_OP_VALID;
+                       }
+               }
+               /* not reached: every case above returns */
+               break;
+
+       case IB_QPT_UC:
+               switch (qp->resp.opcode) {
+               /* a send is in progress */
+               case IB_OPCODE_UC_SEND_FIRST:
+               case IB_OPCODE_UC_SEND_MIDDLE:
+                       switch (pkt->opcode) {
+                       case IB_OPCODE_UC_SEND_MIDDLE:
+                       case IB_OPCODE_UC_SEND_LAST:
+                       case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE:
+                               return RESPST_CHK_OP_VALID;
+                       default:
+                               return RESPST_ERR_MISSING_OPCODE_LAST_D1E;
+                       }
+
+               /* an RDMA write is in progress */
+               case IB_OPCODE_UC_RDMA_WRITE_FIRST:
+               case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+                       switch (pkt->opcode) {
+                       case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+                       case IB_OPCODE_UC_RDMA_WRITE_LAST:
+                       case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+                               return RESPST_CHK_OP_VALID;
+                       default:
+                               return RESPST_ERR_MISSING_OPCODE_LAST_D1E;
+                       }
+
+               /* no message in progress: drop MIDDLE/LAST without error */
+               default:
+                       switch (pkt->opcode) {
+                       case IB_OPCODE_UC_SEND_MIDDLE:
+                       case IB_OPCODE_UC_SEND_LAST:
+                       case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE:
+                       case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+                       case IB_OPCODE_UC_RDMA_WRITE_LAST:
+                       case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+                               qp->resp.drop_msg = 1;
+                               return RESPST_CLEANUP;
+                       default:
+                               return RESPST_CHK_OP_VALID;
+                       }
+               }
+               /* not reached: every case above returns */
+               break;
+
+       default:
+               return RESPST_CHK_OP_VALID;
+       }
+}
+
+/* Check that the QP's access flags permit the requested operation.
+ *
+ * RC: a read, write or atomic request without the matching
+ *     IB_ACCESS_REMOTE_* flag yields RESPST_ERR_UNSUPPORTED_OPCODE.
+ * UC: a write without IB_ACCESS_REMOTE_WRITE makes the responder drop
+ *     the message.
+ * UD/SMI/GSI are not restricted; any other QP type triggers a warning.
+ */
+static enum resp_states check_op_valid(struct rxe_qp *qp,
+                                      struct rxe_pkt_info *pkt)
+{
+       switch (qp_type(qp)) {
+       case IB_QPT_RC:
+               if ((pkt->mask & RXE_READ_MASK) &&
+                   !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ))
+                       return RESPST_ERR_UNSUPPORTED_OPCODE;
+
+               if ((pkt->mask & RXE_WRITE_MASK) &&
+                   !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE))
+                       return RESPST_ERR_UNSUPPORTED_OPCODE;
+
+               if ((pkt->mask & RXE_ATOMIC_MASK) &&
+                   !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))
+                       return RESPST_ERR_UNSUPPORTED_OPCODE;
+
+               break;
+
+       case IB_QPT_UC:
+               if ((pkt->mask & RXE_WRITE_MASK) &&
+                   !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) {
+                       qp->resp.drop_msg = 1;
+                       return RESPST_CLEANUP;
+               }
+
+               break;
+
+       case IB_QPT_UD:
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               break;
+
+       default:
+               WARN_ON(1);
+               break;
+       }
+
+       return RESPST_CHK_RESOURCE;
+}
+
+/* Take the next receive wqe from the QP's shared receive queue.
+ *
+ * The wqe is copied into qp->resp.srq_wqe under the consumer lock and
+ * the queue's consumer is advanced.  When the number of queued entries
+ * falls below the armed srq->limit, the limit is disarmed and, after
+ * the lock is released, IB_EVENT_SRQ_LIMIT_REACHED is delivered to the
+ * SRQ's event handler.
+ *
+ * Returns RESPST_CHK_LENGTH on success and RESPST_ERR_RNR when the SRQ
+ * is in error or empty.
+ */
+static enum resp_states get_srq_wqe(struct rxe_qp *qp)
+{
+       struct rxe_srq *srq = qp->srq;
+       struct rxe_queue *q = srq->rq.queue;
+       struct rxe_recv_wqe *wqe;
+       struct ib_event ev;
+       int limit_reached = 0;
+
+       if (srq->error)
+               return RESPST_ERR_RNR;
+
+       spin_lock_bh(&srq->rq.consumer_lock);
+
+       wqe = queue_head(q);
+       if (!wqe) {
+               spin_unlock_bh(&srq->rq.consumer_lock);
+               return RESPST_ERR_RNR;
+       }
+
+       /* note kernel and user space recv wqes have same size */
+       memcpy(&qp->resp.srq_wqe, wqe, sizeof(qp->resp.srq_wqe));
+
+       qp->resp.wqe = &qp->resp.srq_wqe.wqe;
+       advance_consumer(q);
+
+       if (srq->limit && srq->ibsrq.event_handler &&
+           (queue_count(q) < srq->limit)) {
+               /* one shot: disarm the limit before reporting it */
+               srq->limit = 0;
+               limit_reached = 1;
+       }
+
+       spin_unlock_bh(&srq->rq.consumer_lock);
+
+       if (limit_reached) {
+               /* the handler is called with the lock released */
+               ev.device = qp->ibqp.device;
+               ev.element.srq = qp->ibqp.srq;
+               ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+               srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context);
+       }
+
+       return RESPST_CHK_LENGTH;
+}
+
+/* Make sure the resources needed to execute the request are available.
+ *
+ * Error state: flush the current receive wqe, or the next one from the
+ * QP's own receive queue when no SRQ is attached, with
+ * IB_WC_WR_FLUSH_ERR; exit when there is nothing to flush.
+ * Read/atomic: allowed only if max_rd_atomic is non-zero.
+ * Requests that consume a receive wqe: fetch it from the SRQ or the
+ * receive queue, RESPST_ERR_RNR if none is posted.
+ */
+static enum resp_states check_resource(struct rxe_qp *qp,
+                                      struct rxe_pkt_info *pkt)
+{
+       struct rxe_srq *srq = qp->srq;
+
+       if (qp->resp.state == QP_STATE_ERROR) {
+               /* without a current wqe, a private receive queue may
+                * still hold one to flush
+                */
+               if (!qp->resp.wqe && !srq)
+                       qp->resp.wqe = queue_head(qp->rq.queue);
+
+               if (!qp->resp.wqe)
+                       return RESPST_EXIT;
+
+               qp->resp.status = IB_WC_WR_FLUSH_ERR;
+               return RESPST_COMPLETE;
+       }
+
+       if (pkt->mask & RXE_READ_OR_ATOMIC) {
+               /* it is the requesters job to not send
+                * too many read/atomic ops, we just
+                * recycle the responder resource queue
+                */
+               if (likely(qp->attr.max_rd_atomic > 0))
+                       return RESPST_CHK_LENGTH;
+
+               return RESPST_ERR_TOO_MANY_RDMA_ATM_REQ;
+       }
+
+       if (pkt->mask & RXE_RWR_MASK) {
+               if (srq)
+                       return get_srq_wqe(qp);
+
+               qp->resp.wqe = queue_head(qp->rq.queue);
+               if (!qp->resp.wqe)
+                       return RESPST_ERR_RNR;
+       }
+
+       return RESPST_CHK_LENGTH;
+}
+
+/* Length check step of the responder.  No check is performed for any
+ * QP type at present: every case continues with the rkey check.
+ */
+static enum resp_states check_length(struct rxe_qp *qp,
+                                    struct rxe_pkt_info *pkt)
+{
+       switch (qp_type(qp)) {
+       case IB_QPT_RC:
+       case IB_QPT_UC:
+       default:
+               return RESPST_CHK_RKEY;
+       }
+}
+
+/* Validate the remote key, address range and payload length of an RDMA
+ * read/write or atomic request.
+ *
+ * For read/write the va, rkey and length are latched from the RETH when
+ * the packet carries one; otherwise the values already stored in
+ * qp->resp are used.  For atomics they come from the ATMETH and the
+ * length is fixed at sizeof(u64).  Packets of any other kind go straight
+ * to RESPST_EXECUTE.
+ *
+ * On success the memory object found for the rkey is stored in
+ * qp->resp.mr, together with the reference obtained by lookup_mem(), and
+ * RESPST_EXECUTE is returned.  On failure that reference is dropped and
+ * RESPST_ERR_RKEY_VIOLATION or RESPST_ERR_LENGTH is returned.
+ */
+static enum resp_states check_rkey(struct rxe_qp *qp,
+                                  struct rxe_pkt_info *pkt)
+{
+       struct rxe_mem *mem;
+       u64 va;
+       u32 rkey;
+       u32 resid;
+       u32 pktlen;
+       int mtu = qp->mtu;
+       enum resp_states state;
+       int access;
+
+       if (pkt->mask & (RXE_READ_MASK | RXE_WRITE_MASK)) {
+               if (pkt->mask & RXE_RETH_MASK) {
+                       qp->resp.va = reth_va(pkt);
+                       qp->resp.rkey = reth_rkey(pkt);
+                       qp->resp.resid = reth_len(pkt);
+               }
+               access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ
+                                                    : IB_ACCESS_REMOTE_WRITE;
+       } else if (pkt->mask & RXE_ATOMIC_MASK) {
+               qp->resp.va = atmeth_va(pkt);
+               qp->resp.rkey = atmeth_rkey(pkt);
+               qp->resp.resid = sizeof(u64);
+               access = IB_ACCESS_REMOTE_ATOMIC;
+       } else {
+               return RESPST_EXECUTE;
+       }
+
+       va      = qp->resp.va;
+       rkey    = qp->resp.rkey;
+       resid   = qp->resp.resid;
+       pktlen  = payload_size(pkt);
+
+       mem = lookup_mem(qp->pd, access, rkey, lookup_remote);
+       if (!mem) {
+               state = RESPST_ERR_RKEY_VIOLATION;
+               goto err1;
+       }
+
+       /* From here on mem holds a reference that every failure path
+        * must drop, including this one (it used to jump to err1 and
+        * leak the reference).
+        */
+       if (unlikely(mem->state == RXE_MEM_STATE_FREE)) {
+               state = RESPST_ERR_RKEY_VIOLATION;
+               goto err2;
+       }
+
+       if (mem_check_range(mem, va, resid)) {
+               state = RESPST_ERR_RKEY_VIOLATION;
+               goto err2;
+       }
+
+       if (pkt->mask & RXE_WRITE_MASK)  {
+               if (resid > mtu) {
+                       /* more data follows: the packet must be exactly
+                        * one mtu long and unpadded
+                        */
+                       if (pktlen != mtu || bth_pad(pkt)) {
+                               state = RESPST_ERR_LENGTH;
+                               goto err2;
+                       }
+
+                       resid = mtu;
+               } else {
+                       /* last packet: it must carry exactly the rest */
+                       if (pktlen != resid) {
+                               state = RESPST_ERR_LENGTH;
+                               goto err2;
+                       }
+                       if ((bth_pad(pkt) != (0x3 & (-resid)))) {
+                               /* This case may not be exactly that
+                                * but nothing else fits.
+                                */
+                               state = RESPST_ERR_LENGTH;
+                               goto err2;
+                       }
+               }
+       }
+
+       WARN_ON(qp->resp.mr);
+
+       /* the reference travels with qp->resp.mr from here */
+       qp->resp.mr = mem;
+       return RESPST_EXECUTE;
+
+err2:
+       rxe_drop_ref(mem);
+err1:
+       return state;
+}
+
+/* Copy data_len bytes from data_addr into the buffers described by the
+ * current receive WQE (qp->resp.wqe->dma).
+ *
+ * Returns RESPST_NONE on success, RESPST_ERR_LENGTH when copy_data()
+ * fails with -ENOSPC and RESPST_ERR_MALFORMED_WQE for any other failure.
+ */
+static enum resp_states send_data_in(struct rxe_qp *qp, void *data_addr,
+                                    int data_len)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       int rc;
+
+       rc = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, &qp->resp.wqe->dma,
+                      data_addr, data_len, to_mem_obj, NULL);
+       if (likely(!rc))
+               return RESPST_NONE;
+
+       if (rc == -ENOSPC)
+               return RESPST_ERR_LENGTH;
+
+       return RESPST_ERR_MALFORMED_WQE;
+}
+
+/* Store the payload of an RDMA write packet at qp->resp.va inside the
+ * memory region held in qp->resp.mr.
+ *
+ * Returns RESPST_NONE on success or RESPST_ERR_RKEY_VIOLATION when the
+ * copy fails; the write cursor is only advanced on success.
+ */
+static enum resp_states write_data_in(struct rxe_qp *qp,
+                                     struct rxe_pkt_info *pkt)
+{
+       int len = payload_size(pkt);
+
+       if (rxe_mem_copy(qp->resp.mr, qp->resp.va, payload_addr(pkt),
+                        len, to_mem_obj, NULL))
+               return RESPST_ERR_RKEY_VIOLATION;
+
+       /* step the cursor past the bytes that were just written */
+       qp->resp.va += len;
+       qp->resp.resid -= len;
+
+       return RESPST_NONE;
+}
+
+/* Guarantee atomicity of atomic operations at the machine level. */
+static DEFINE_SPINLOCK(atomic_ops_lock);
+
+/* Carry out an atomic request on the 8 byte target inside qp->resp.mr.
+ *
+ * Compare & swap opcodes replace the target with the swap value when it
+ * equals the compare value; every other opcode adds the operand to the
+ * target. The value found before the operation is saved in
+ * qp->resp.atomic_orig.
+ *
+ * Returns RESPST_NONE on success, RESPST_ERR_RKEY_VIOLATION if the memory
+ * region is not in the valid state and RESPST_ERR_MISALIGNED_ATOMIC if
+ * the target cannot be mapped or is not 8 byte aligned.
+ */
+static enum resp_states process_atomic(struct rxe_qp *qp,
+                                      struct rxe_pkt_info *pkt)
+{
+       u64 iova = atmeth_va(pkt);
+       struct rxe_mem *mr = qp->resp.mr;
+       u64 *target;
+
+       if (mr->state != RXE_MEM_STATE_VALID)
+               return RESPST_ERR_RKEY_VIOLATION;
+
+       target = iova_to_vaddr(mr, iova, sizeof(u64));
+
+       /* the target must exist and sit on an 8 byte boundary */
+       if (!target || ((uintptr_t)target & 7))
+               return RESPST_ERR_MISALIGNED_ATOMIC;
+
+       spin_lock_bh(&atomic_ops_lock);
+
+       /* keep the old value, it is what the atomic ack reports back */
+       qp->resp.atomic_orig = *target;
+
+       switch (pkt->opcode) {
+       case IB_OPCODE_RC_COMPARE_SWAP:
+       case IB_OPCODE_RD_COMPARE_SWAP:
+               if (*target == atmeth_comp(pkt))
+                       *target = atmeth_swap_add(pkt);
+               break;
+       default:
+               *target += atmeth_swap_add(pkt);
+               break;
+       }
+
+       spin_unlock_bh(&atomic_ops_lock);
+
+       return RESPST_NONE;
+}
+
+/* Build an acknowledge/response packet for the request in pkt.
+ *
+ * qp:       queue pair the response belongs to
+ * pkt:      request packet; its headers up to and including the BTH are
+ *           copied and then patched to form the response header
+ * ack:      packet info that is filled in for the new packet
+ * opcode:   opcode of the response
+ * payload:  payload size in bytes, not counting the padding
+ * psn:      packet sequence number written into the BTH
+ * syndrome: AETH syndrome, only used if the opcode carries an AETH
+ * crcp:     if not NULL it receives the CRC computed so far and the
+ *           caller has to finish and store it; if NULL the complemented
+ *           CRC is written right behind the padded payload here
+ *
+ * Returns the new skb, or NULL if the packet could not be allocated or
+ * prepared.
+ */
+static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
+                                         struct rxe_pkt_info *pkt,
+                                         struct rxe_pkt_info *ack,
+                                         int opcode,
+                                         int payload,
+                                         u32 psn,
+                                         u8 syndrome,
+                                         u32 *crcp)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       struct sk_buff *skb;
+       u32 crc = 0;
+       u32 *p;
+       int paylen;
+       int pad;
+       int err;
+
+       /*
+        * allocate packet
+        */
+       /* pad is the number of bytes needed to round payload up to a
+        * multiple of four
+        */
+       pad = (-payload) & 0x3;
+       paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE;
+
+       skb = rxe->ifc_ops->init_packet(rxe, &qp->pri_av, paylen, ack);
+       if (!skb)
+               return NULL;
+
+       ack->qp = qp;
+       ack->opcode = opcode;
+       ack->mask = rxe_opcode[opcode].mask;
+       ack->offset = pkt->offset;
+       ack->paylen = paylen;
+
+       /* fill in bth using the request packet headers */
+       memcpy(ack->hdr, pkt->hdr, pkt->offset + RXE_BTH_BYTES);
+
+       /* overwrite the copied BTH fields that differ in the response */
+       bth_set_opcode(ack, opcode);
+       bth_set_qpn(ack, qp->attr.dest_qp_num);
+       bth_set_pad(ack, pad);
+       bth_set_se(ack, 0);
+       bth_set_psn(ack, psn);
+       bth_set_ack(ack, 0);
+       ack->psn = psn;
+
+       if (ack->mask & RXE_AETH_MASK) {
+               aeth_set_syn(ack, syndrome);
+               aeth_set_msn(ack, qp->resp.msn);
+       }
+
+       /* atomic acks report the value found before the operation */
+       if (ack->mask & RXE_ATMACK_MASK)
+               atmack_set_orig(ack, qp->resp.atomic_orig);
+
+       err = rxe->ifc_ops->prepare(rxe, ack, skb, &crc);
+       if (err) {
+               kfree_skb(skb);
+               return NULL;
+       }
+
+       if (crcp) {
+               /* CRC computation will be continued by the caller */
+               *crcp = crc;
+       } else {
+               /* no payload to add: store the complemented CRC now */
+               p = payload_addr(ack) + payload + bth_pad(ack);
+               *p = ~crc;
+       }
+
+       return skb;
+}
+
+/* RDMA read response. If res is not NULL, then we have a current RDMA request
+ * being processed or replayed.
+ *
+ * Each call sends at most one response packet carrying up to one MTU of
+ * data. Returns RESPST_DONE while data remains to be sent, RESPST_CLEANUP
+ * once the last packet has gone out, and RESPST_ERR_RNR if the response
+ * packet could not be built or transmitted.
+ */
+static enum resp_states read_reply(struct rxe_qp *qp,
+                                  struct rxe_pkt_info *req_pkt)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       struct rxe_pkt_info ack_pkt;
+       struct sk_buff *skb;
+       int mtu = qp->mtu;
+       enum resp_states state;
+       int payload;
+       int opcode;
+       int err;
+       struct resp_res *res = qp->resp.res;
+       u32 icrc;
+       u32 *p;
+
+       if (!res) {
+               /* This is the first time we process that request. Get a
+                * resource
+                */
+               res = &qp->resp.resources[qp->resp.res_head];
+
+               /* release whatever the slot held before and move the head */
+               free_rd_atomic_resource(qp, res);
+               rxe_advance_resp_resource(qp);
+
+               res->type               = RXE_READ_MASK;
+
+               res->read.va            = qp->resp.va;
+               res->read.va_org        = qp->resp.va;
+
+               /* one response packet, and thus one PSN, per mtu bytes of
+                * the requested length
+                */
+               res->first_psn          = req_pkt->psn;
+               res->last_psn           = req_pkt->psn +
+                                         (reth_len(req_pkt) + mtu - 1) /
+                                         mtu - 1;
+               res->cur_psn            = req_pkt->psn;
+
+               res->read.resid         = qp->resp.resid;
+               res->read.length        = qp->resp.resid;
+               res->read.rkey          = qp->resp.rkey;
+
+               /* note res inherits the reference to mr from qp */
+               res->read.mr            = qp->resp.mr;
+               qp->resp.mr             = NULL;
+
+               qp->resp.res            = res;
+               res->state              = rdatm_res_state_new;
+       }
+
+       /* a new resource starts with ONLY or FIRST, any other state
+        * continues with MIDDLE or LAST, depending on how much is left
+        */
+       if (res->state == rdatm_res_state_new) {
+               if (res->read.resid <= mtu)
+                       opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY;
+               else
+                       opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST;
+       } else {
+               if (res->read.resid > mtu)
+                       opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE;
+               else
+                       opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST;
+       }
+
+       res->state = rdatm_res_state_next;
+
+       payload = min_t(int, res->read.resid, mtu);
+
+       /* the CRC is returned in icrc so the payload can be folded in */
+       skb = prepare_ack_packet(qp, req_pkt, &ack_pkt, opcode, payload,
+                                res->cur_psn, AETH_ACK_UNLIMITED, &icrc);
+       if (!skb)
+               return RESPST_ERR_RNR;
+
+       /* NOTE(review): a failed copy is only logged, the packet is still
+        * sent with whatever the payload area contains - confirm intended
+        */
+       err = rxe_mem_copy(res->read.mr, res->read.va, payload_addr(&ack_pkt),
+                          payload, from_mem_obj, &icrc);
+       if (err)
+               pr_err("Failed copying memory\n");
+
+       /* store the complemented CRC behind the padded payload */
+       p = payload_addr(&ack_pkt) + payload + bth_pad(&ack_pkt);
+       *p = ~icrc;
+
+       err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
+       if (err) {
+               pr_err("Failed sending RDMA reply.\n");
+               kfree_skb(skb);
+               return RESPST_ERR_RNR;
+       }
+
+       /* the packet is out: advance the read cursor and the PSN */
+       res->read.va += payload;
+       res->read.resid -= payload;
+       res->cur_psn = (res->cur_psn + 1) & BTH_PSN_MASK;
+
+       if (res->read.resid > 0) {
+               state = RESPST_DONE;
+       } else {
+               /* finished: detach the resource from the qp and resume
+                * normal request processing at the next PSN
+                */
+               qp->resp.res = NULL;
+               qp->resp.opcode = -1;
+               qp->resp.psn = res->cur_psn;
+               state = RESPST_CLEANUP;
+       }
+
+       return state;
+}
+
+/* Executes a new request. A retried request never reaches this function
+ * (sends and writes are discarded, and reads and atomics are retried
+ * elsewhere).
+ *
+ * Returns the error state reported by the data handling helper on
+ * failure. RDMA reads return RESPST_READ_REPLY. Otherwise the next state
+ * is RESPST_COMPLETE if the packet has RXE_COMP_MASK set,
+ * RESPST_ACKNOWLEDGE for RC queue pairs and RESPST_CLEANUP for all others.
+ */
+static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
+{
+       enum resp_states err;
+
+       if (pkt->mask & RXE_SEND_MASK) {
+               if (qp_type(qp) == IB_QPT_UD ||
+                   qp_type(qp) == IB_QPT_SMI ||
+                   qp_type(qp) == IB_QPT_GSI) {
+                       union rdma_network_hdr hdr;
+                       struct sk_buff *skb = PKT_TO_SKB(pkt);
+
+                       /* for these QP types the receive buffer starts
+                        * with a copy of the packet's IP header, zero
+                        * filled up to sizeof(hdr)
+                        */
+                       memset(&hdr, 0, sizeof(hdr));
+                       if (skb->protocol == htons(ETH_P_IP))
+                               memcpy(&hdr.roce4grh, ip_hdr(skb), sizeof(hdr.roce4grh));
+                       else if (skb->protocol == htons(ETH_P_IPV6))
+                               memcpy(&hdr.ibgrh, ipv6_hdr(skb), sizeof(hdr.ibgrh));
+
+                       err = send_data_in(qp, &hdr, sizeof(hdr));
+                       if (err)
+                               return err;
+               }
+               err = send_data_in(qp, payload_addr(pkt), payload_size(pkt));
+               if (err)
+                       return err;
+       } else if (pkt->mask & RXE_WRITE_MASK) {
+               err = write_data_in(qp, pkt);
+               if (err)
+                       return err;
+       } else if (pkt->mask & RXE_READ_MASK) {
+               /* For RDMA Read we can increment the msn now. See C9-148. */
+               qp->resp.msn++;
+               return RESPST_READ_REPLY;
+       } else if (pkt->mask & RXE_ATOMIC_MASK) {
+               err = process_atomic(qp, pkt);
+               if (err)
+                       return err;
+       } else
+               /* Unreachable */
+               WARN_ON(1);
+
+       /* We successfully processed this new request. */
+       qp->resp.msn++;
+
+       /* next expected psn, read handles this separately */
+       qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+
+       qp->resp.opcode = pkt->opcode;
+       qp->resp.status = IB_WC_SUCCESS;
+
+       if (pkt->mask & RXE_COMP_MASK)
+               return RESPST_COMPLETE;
+       else if (qp_type(qp) == IB_QPT_RC)
+               return RESPST_ACKNOWLEDGE;
+       else
+               return RESPST_CLEANUP;
+}
+
+/* Generate the receive completion for the current receive WQE.
+ *
+ * Builds a completion entry from qp->resp.status and, for successful
+ * completions, from the packet headers, using the user space layout when
+ * the receive CQ is a user CQ and the kernel layout otherwise. The WQE is
+ * then released and the entry is posted to the receive CQ.
+ *
+ * pkt may be NULL; the code reads it unconditionally when the status is
+ * IB_WC_SUCCESS, so a NULL pkt presumably only occurs with an error
+ * status - TODO confirm against the callers.
+ *
+ * Returns RESPST_CLEANUP if there is no WQE to complete, RESPST_ERROR if
+ * an invalidate request names an unknown rkey, RESPST_ERR_CQ_OVERFLOW if
+ * the completion cannot be posted, RESPST_CHK_RESOURCE if the responder
+ * is in the error state, RESPST_DONE if pkt is NULL, and otherwise
+ * RESPST_ACKNOWLEDGE for RC queue pairs or RESPST_CLEANUP.
+ */
+static enum resp_states do_complete(struct rxe_qp *qp,
+                                   struct rxe_pkt_info *pkt)
+{
+       struct rxe_cqe cqe;
+       struct ib_wc *wc = &cqe.ibwc;
+       struct ib_uverbs_wc *uwc = &cqe.uibwc;
+       struct rxe_recv_wqe *wqe = qp->resp.wqe;
+
+       if (unlikely(!wqe))
+               return RESPST_CLEANUP;
+
+       memset(&cqe, 0, sizeof(cqe));
+
+       wc->wr_id               = wqe->wr_id;
+       wc->status              = qp->resp.status;
+       wc->qp                  = &qp->ibqp;
+
+       /* fields after status are not required for errors */
+       if (wc->status == IB_WC_SUCCESS) {
+               wc->opcode = (pkt->mask & RXE_IMMDT_MASK &&
+                               pkt->mask & RXE_WRITE_MASK) ?
+                                       IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV;
+               wc->vendor_err = 0;
+               wc->byte_len = wqe->dma.length - wqe->dma.resid;
+
+               /* fields after byte_len are different between kernel and user
+                * space
+                */
+               if (qp->rcq->is_user) {
+                       uwc->wc_flags = IB_WC_GRH;
+
+                       if (pkt->mask & RXE_IMMDT_MASK) {
+                               uwc->wc_flags |= IB_WC_WITH_IMM;
+                               uwc->ex.imm_data =
+                                       (__u32 __force)immdt_imm(pkt);
+                       }
+
+                       /* NOTE(review): unlike the kernel branch below, the
+                        * rkey is only reported here and the memory region
+                        * is not invalidated - confirm this is intended
+                        */
+                       if (pkt->mask & RXE_IETH_MASK) {
+                               uwc->wc_flags |= IB_WC_WITH_INVALIDATE;
+                               uwc->ex.invalidate_rkey = ieth_rkey(pkt);
+                       }
+
+                       uwc->qp_num             = qp->ibqp.qp_num;
+
+                       if (pkt->mask & RXE_DETH_MASK)
+                               uwc->src_qp = deth_sqp(pkt);
+
+                       uwc->port_num           = qp->attr.port_num;
+               } else {
+                       struct sk_buff *skb = PKT_TO_SKB(pkt);
+
+                       wc->wc_flags = IB_WC_GRH | IB_WC_WITH_NETWORK_HDR_TYPE;
+                       if (skb->protocol == htons(ETH_P_IP))
+                               wc->network_hdr_type = RDMA_NETWORK_IPV4;
+                       else
+                               wc->network_hdr_type = RDMA_NETWORK_IPV6;
+
+                       if (pkt->mask & RXE_IMMDT_MASK) {
+                               wc->wc_flags |= IB_WC_WITH_IMM;
+                               wc->ex.imm_data = immdt_imm(pkt);
+                       }
+
+                       if (pkt->mask & RXE_IETH_MASK) {
+                               struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+                               struct rxe_mem *rmr;
+
+                               wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+                               wc->ex.invalidate_rkey = ieth_rkey(pkt);
+
+                               /* the pool index is the rkey without its
+                                * low 8 bits
+                                */
+                               rmr = rxe_pool_get_index(&rxe->mr_pool,
+                                                        wc->ex.invalidate_rkey >> 8);
+                               if (unlikely(!rmr)) {
+                                       pr_err("Bad rkey %#x invalidation\n", wc->ex.invalidate_rkey);
+                                       return RESPST_ERROR;
+                               }
+                               rmr->state = RXE_MEM_STATE_FREE;
+                               /* release the reference taken by the pool
+                                * lookup, it was only needed to flip the state
+                                */
+                               rxe_drop_ref(rmr);
+                       }
+
+                       wc->qp                  = &qp->ibqp;
+
+                       if (pkt->mask & RXE_DETH_MASK)
+                               wc->src_qp = deth_sqp(pkt);
+
+                       wc->port_num            = qp->attr.port_num;
+               }
+       }
+
+       /* have copy for srq and reference for !srq */
+       if (!qp->srq)
+               advance_consumer(qp->rq.queue);
+
+       qp->resp.wqe = NULL;
+
+       /* without a packet the solicited argument defaults to 1 */
+       if (rxe_cq_post(qp->rcq, &cqe, pkt ? bth_se(pkt) : 1))
+               return RESPST_ERR_CQ_OVERFLOW;
+
+       if (qp->resp.state == QP_STATE_ERROR)
+               return RESPST_CHK_RESOURCE;
+
+       if (!pkt)
+               return RESPST_DONE;
+       else if (qp_type(qp) == IB_QPT_RC)
+               return RESPST_ACKNOWLEDGE;
+       else
+               return RESPST_CLEANUP;
+}
+
+/* Build and transmit a plain RC acknowledge with the given AETH syndrome
+ * and PSN.
+ *
+ * Returns 0 on success, -ENOMEM if the packet could not be built, or the
+ * error from rxe_xmit_packet(), in which case the skb is freed here.
+ */
+static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
+                   u8 syndrome, u32 psn)
+{
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       struct rxe_pkt_info ack_pkt;
+       struct sk_buff *skb;
+       int rc;
+
+       skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE,
+                                0, psn, syndrome, NULL);
+       if (!skb)
+               return -ENOMEM;
+
+       rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
+       if (!rc)
+               return 0;
+
+       pr_err_ratelimited("Failed sending ack\n");
+       kfree_skb(skb);
+
+       return rc;
+}
+
+/* Send the acknowledge for an atomic operation and keep it for replay.
+ *
+ * The ack skb built here is stored in the next responder resource so a
+ * duplicate request can be answered again; a clone of it is what is
+ * actually handed to the transmit path.
+ *
+ * Returns 0 on success, -ENOMEM if the ack could not be built or cloned,
+ * or the error from rxe_xmit_packet().
+ */
+static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
+                          u8 syndrome)
+{
+       int rc = 0;
+       struct rxe_pkt_info ack_pkt;
+       struct sk_buff *skb;
+       struct sk_buff *skb_copy;
+       struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+       struct resp_res *res;
+
+       skb = prepare_ack_packet(qp, pkt, &ack_pkt,
+                                IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn,
+                                syndrome, NULL);
+       if (!skb) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       skb_copy = skb_clone(skb, GFP_ATOMIC);
+       if (!skb_copy) {
+               pr_warn("Could not clone atomic response\n");
+               /* skb has not been handed to anyone yet, do not leak it */
+               kfree_skb(skb);
+               rc = -ENOMEM;
+               goto out;
+       }
+       rxe_add_ref(qp); /* for the new SKB */
+
+       /* recycle the next responder resource and park the original skb
+        * in it for a possible replay
+        */
+       res = &qp->resp.resources[qp->resp.res_head];
+       free_rd_atomic_resource(qp, res);
+       rxe_advance_resp_resource(qp);
+
+       res->type = RXE_ATOMIC_MASK;
+       res->atomic.skb = skb;
+       res->first_psn = qp->resp.psn;
+       res->last_psn = qp->resp.psn;
+       res->cur_psn = qp->resp.psn;
+
+       rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb_copy);
+       if (rc) {
+               pr_err_ratelimited("Failed sending ack\n");
+               /* undo the reference taken for the clone */
+               rxe_drop_ref(qp);
+               kfree_skb(skb_copy);
+       }
+
+out:
+       return rc;
+}
+
+/* Acknowledge stage: send whatever response the request calls for.
+ *
+ * Only RC queue pairs send anything. A pending NAK syndrome takes
+ * precedence, then atomic requests get an atomic ack, and any other
+ * request is acked only if its BTH ack-request bit is set. Send failures
+ * are ignored. Always returns RESPST_CLEANUP.
+ */
+static enum resp_states acknowledge(struct rxe_qp *qp,
+                                   struct rxe_pkt_info *pkt)
+{
+       if (qp_type(qp) == IB_QPT_RC) {
+               if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED)
+                       send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn);
+               else if (pkt->mask & RXE_ATOMIC_MASK)
+                       send_atomic_ack(qp, pkt, AETH_ACK_UNLIMITED);
+               else if (bth_ack(pkt))
+                       send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn);
+       }
+
+       return RESPST_CLEANUP;
+}
+
+/* Cleanup stage: release the per-request resources.
+ *
+ * If a packet was being processed, one skb is taken off the request
+ * queue and freed together with a reference on the qp. A memory region
+ * still attached to the responder is released as well. Always returns
+ * RESPST_DONE.
+ */
+static enum resp_states cleanup(struct rxe_qp *qp,
+                               struct rxe_pkt_info *pkt)
+{
+       if (pkt) {
+               struct sk_buff *skb = skb_dequeue(&qp->req_pkts);
+
+               rxe_drop_ref(qp);
+               kfree_skb(skb);
+       }
+
+       if (!qp->resp.mr)
+               return RESPST_DONE;
+
+       rxe_drop_ref(qp->resp.mr);
+       qp->resp.mr = NULL;
+
+       return RESPST_DONE;
+}
+
+/* Look up the responder resource whose PSN range contains psn.
+ *
+ * Scans the first qp->attr.max_rd_atomic resources, skipping slots whose
+ * type is 0. Returns the first match or NULL if there is none.
+ */
+static struct resp_res *find_resource(struct rxe_qp *qp, u32 psn)
+{
+       int idx;
+
+       for (idx = 0; idx < qp->attr.max_rd_atomic; idx++) {
+               struct resp_res *res = &qp->resp.resources[idx];
+
+               /* slot is not in use */
+               if (!res->type)
+                       continue;
+
+               if (psn_compare(psn, res->first_psn) < 0)
+                       continue;
+
+               if (psn_compare(psn, res->last_psn) > 0)
+                       continue;
+
+               return res;
+       }
+
+       return NULL;
+}
+
+/* Handle a request whose PSN has already been seen.
+ *
+ * Sends and writes are acked again if the packet asks for an ack. RDMA
+ * reads are replayed from the matching responder resource, provided the
+ * repeated request uses the same rkey and stays within the original
+ * range. Any other request is treated as an atomic and answered by
+ * resending a clone of the ack stored in the responder resource.
+ *
+ * Returns RESPST_READ_REPLY when a read is to be replayed and
+ * RESPST_CLEANUP in every other case.
+ */
+static enum resp_states duplicate_request(struct rxe_qp *qp,
+                                         struct rxe_pkt_info *pkt)
+{
+       enum resp_states rc;
+
+       if (pkt->mask & RXE_SEND_MASK ||
+           pkt->mask & RXE_WRITE_MASK) {
+               /* SEND. Ack again and cleanup. C9-105. */
+               if (bth_ack(pkt))
+                       send_ack(qp, pkt, AETH_ACK_UNLIMITED, qp->resp.psn - 1);
+               rc = RESPST_CLEANUP;
+               goto out;
+       } else if (pkt->mask & RXE_READ_MASK) {
+               struct resp_res *res;
+
+               res = find_resource(qp, pkt->psn);
+               if (!res) {
+                       /* Resource not found. Class D error.  Drop the
+                        * request.
+                        */
+                       rc = RESPST_CLEANUP;
+                       goto out;
+               } else {
+                       /* Ensure this new request is the same as the previous
+                        * one or a subset of it.
+                        */
+                       u64 iova = reth_va(pkt);
+                       u32 resid = reth_len(pkt);
+
+                       if (iova < res->read.va_org ||
+                           resid > res->read.length ||
+                           (iova + resid) > (res->read.va_org +
+                                             res->read.length)) {
+                               rc = RESPST_CLEANUP;
+                               goto out;
+                       }
+
+                       if (reth_rkey(pkt) != res->read.rkey) {
+                               rc = RESPST_CLEANUP;
+                               goto out;
+                       }
+
+                       /* restart from the first packet or resume in the
+                        * middle of the original response
+                        */
+                       res->cur_psn = pkt->psn;
+                       res->state = (pkt->psn == res->first_psn) ?
+                                       rdatm_res_state_new :
+                                       rdatm_res_state_replay;
+
+                       /* Reset the resource, except length. */
+                       res->read.va_org = iova;
+                       res->read.va = iova;
+                       res->read.resid = resid;
+
+                       /* Replay the RDMA read reply. */
+                       qp->resp.res = res;
+                       rc = RESPST_READ_REPLY;
+                       goto out;
+               }
+       } else {
+               struct resp_res *res;
+
+               /* Find the operation in our list of responder resources. */
+               res = find_resource(qp, pkt->psn);
+               if (res) {
+                       struct sk_buff *skb_copy;
+
+                       skb_copy = skb_clone(res->atomic.skb, GFP_ATOMIC);
+                       if (skb_copy) {
+                               rxe_add_ref(qp); /* for the new SKB */
+                       } else {
+                               pr_warn("Couldn't clone atomic resp\n");
+                               rc = RESPST_CLEANUP;
+                               goto out;
+                       }
+                       bth_set_psn(SKB_TO_PKT(skb_copy),
+                                   qp->resp.psn - 1);
+                       /* Resend the result. */
+                       rc = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp,
+                                            pkt, skb_copy);
+                       if (rc) {
+                               pr_err("Failed resending result. This flow is not handled - skb ignored\n");
+                               /* undo the reference taken for the clone */
+                               rxe_drop_ref(qp);
+                               kfree_skb(skb_copy);
+                               rc = RESPST_CLEANUP;
+                               goto out;
+                       }
+               }
+
+               /* Either the result was resent or no resource was found
+                * (Class D error, the request is dropped). Clean up.
+                */
+               rc = RESPST_CLEANUP;
+               goto out;
+       }
+out:
+       return rc;
+}
+
+/* Record a class A or class C error; this implementation handles both the
+ * same way. The NAK syndrome and the completion status are stored on the
+ * responder and the qp is flagged to pass through the ERROR state.
+ */
+static void do_class_ac_error(struct rxe_qp *qp, u8 syndrome,
+                             enum ib_wc_status status)
+{
+       /* make the state machine go through RESPST_ERROR when it finishes */
+       qp->resp.goto_error = 1;
+
+       qp->resp.status = status;
+       qp->resp.aeth_syndrome = syndrome;
+}
+
+/* Handle a missing "last" opcode on a UC queue pair.
+ *
+ * With an SRQ this is a class E error: the message is dropped and, if a
+ * receive WQE is in use, it is completed with IB_WC_REM_INV_REQ_ERR.
+ * Without an SRQ it is a class D1 error: the receive WQE is rewound and
+ * the attached memory region is released. Returns RESPST_COMPLETE in the
+ * former case when a WQE is present and RESPST_CLEANUP otherwise.
+ */
+static enum resp_states do_class_d1e_error(struct rxe_qp *qp)
+{
+       struct rxe_recv_wqe *wqe = qp->resp.wqe;
+
+       /* UC */
+       if (qp->srq) {
+               /* Class E */
+               qp->resp.drop_msg = 1;
+               if (!wqe)
+                       return RESPST_CLEANUP;
+
+               qp->resp.status = IB_WC_REM_INV_REQ_ERR;
+               return RESPST_COMPLETE;
+       }
+
+       /* Class D1. The previous message is invalid and is ignored, but
+        * this packet may start a new, valid message, so put the recv wr
+        * back into its original state.
+        */
+       if (wqe) {
+               wqe->dma.resid = wqe->dma.length;
+               wqe->dma.cur_sge = 0;
+               wqe->dma.sge_offset = 0;
+               qp->resp.opcode = -1;
+       }
+
+       if (qp->resp.mr) {
+               rxe_drop_ref(qp->resp.mr);
+               qp->resp.mr = NULL;
+       }
+
+       return RESPST_CLEANUP;
+}
+
+/* Responder state machine of a queue pair.
+ *
+ * arg is the struct rxe_qp to work on. Starting from RESPST_GET_REQ (or
+ * RESPST_RESET if the responder is in the reset state), each state
+ * handler returns the next state until one of the terminal states is
+ * reached.
+ *
+ * Returns 0 when processing ends in RESPST_DONE, -EINVAL if the qp is not
+ * valid, and -EAGAIN when the machine leaves through the exit label
+ * (RESPST_EXIT, RESPST_RESET or RESPST_ERROR).
+ */
+int rxe_responder(void *arg)
+{
+       struct rxe_qp *qp = (struct rxe_qp *)arg;
+       enum resp_states state;
+       struct rxe_pkt_info *pkt = NULL;
+       int ret = 0;
+
+       /* start every run without a pending NAK syndrome */
+       qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED;
+
+       if (!qp->valid) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       switch (qp->resp.state) {
+       case QP_STATE_RESET:
+               state = RESPST_RESET;
+               break;
+
+       default:
+               state = RESPST_GET_REQ;
+               break;
+       }
+
+       while (1) {
+               pr_debug("state = %s\n", resp_state_name[state]);
+               switch (state) {
+               case RESPST_GET_REQ:
+                       state = get_req(qp, &pkt);
+                       break;
+               case RESPST_CHK_PSN:
+                       state = check_psn(qp, pkt);
+                       break;
+               case RESPST_CHK_OP_SEQ:
+                       state = check_op_seq(qp, pkt);
+                       break;
+               case RESPST_CHK_OP_VALID:
+                       state = check_op_valid(qp, pkt);
+                       break;
+               case RESPST_CHK_RESOURCE:
+                       state = check_resource(qp, pkt);
+                       break;
+               case RESPST_CHK_LENGTH:
+                       state = check_length(qp, pkt);
+                       break;
+               case RESPST_CHK_RKEY:
+                       state = check_rkey(qp, pkt);
+                       break;
+               case RESPST_EXECUTE:
+                       state = execute(qp, pkt);
+                       break;
+               case RESPST_COMPLETE:
+                       state = do_complete(qp, pkt);
+                       break;
+               case RESPST_READ_REPLY:
+                       state = read_reply(qp, pkt);
+                       break;
+               case RESPST_ACKNOWLEDGE:
+                       state = acknowledge(qp, pkt);
+                       break;
+               case RESPST_CLEANUP:
+                       state = cleanup(qp, pkt);
+                       break;
+               case RESPST_DUPLICATE_REQUEST:
+                       state = duplicate_request(qp, pkt);
+                       break;
+               case RESPST_ERR_PSN_OUT_OF_SEQ:
+                       /* RC only - Class B. Drop packet. */
+                       send_ack(qp, pkt, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn);
+                       state = RESPST_CLEANUP;
+                       break;
+
+               case RESPST_ERR_TOO_MANY_RDMA_ATM_REQ:
+               case RESPST_ERR_MISSING_OPCODE_FIRST:
+               case RESPST_ERR_MISSING_OPCODE_LAST_C:
+               case RESPST_ERR_UNSUPPORTED_OPCODE:
+               case RESPST_ERR_MISALIGNED_ATOMIC:
+                       /* RC Only - Class C. */
+                       do_class_ac_error(qp, AETH_NAK_INVALID_REQ,
+                                         IB_WC_REM_INV_REQ_ERR);
+                       state = RESPST_COMPLETE;
+                       break;
+
+               case RESPST_ERR_MISSING_OPCODE_LAST_D1E:
+                       state = do_class_d1e_error(qp);
+                       break;
+               case RESPST_ERR_RNR:
+                       if (qp_type(qp) == IB_QPT_RC) {
+                               /* RC - class B */
+                               /* the RNR NAK syndrome carries the minimum
+                                * RNR timer in its non-type bits
+                                */
+                               send_ack(qp, pkt, AETH_RNR_NAK |
+                                        (~AETH_TYPE_MASK &
+                                        qp->attr.min_rnr_timer),
+                                        pkt->psn);
+                       } else {
+                               /* UD/UC - class D */
+                               qp->resp.drop_msg = 1;
+                       }
+                       state = RESPST_CLEANUP;
+                       break;
+
+               case RESPST_ERR_RKEY_VIOLATION:
+                       if (qp_type(qp) == IB_QPT_RC) {
+                               /* Class C */
+                               do_class_ac_error(qp, AETH_NAK_REM_ACC_ERR,
+                                                 IB_WC_REM_ACCESS_ERR);
+                               state = RESPST_COMPLETE;
+                       } else {
+                               qp->resp.drop_msg = 1;
+                               if (qp->srq) {
+                                       /* UC/SRQ Class D */
+                                       qp->resp.status = IB_WC_REM_ACCESS_ERR;
+                                       state = RESPST_COMPLETE;
+                               } else {
+                                       /* UC/non-SRQ Class E. */
+                                       state = RESPST_CLEANUP;
+                               }
+                       }
+                       break;
+
+               case RESPST_ERR_LENGTH:
+                       if (qp_type(qp) == IB_QPT_RC) {
+                               /* Class C */
+                               do_class_ac_error(qp, AETH_NAK_INVALID_REQ,
+                                                 IB_WC_REM_INV_REQ_ERR);
+                               state = RESPST_COMPLETE;
+                       } else if (qp->srq) {
+                               /* UC/UD - class E */
+                               qp->resp.status = IB_WC_REM_INV_REQ_ERR;
+                               state = RESPST_COMPLETE;
+                       } else {
+                               /* UC/UD - class D */
+                               qp->resp.drop_msg = 1;
+                               state = RESPST_CLEANUP;
+                       }
+                       break;
+
+               case RESPST_ERR_MALFORMED_WQE:
+                       /* All, Class A. */
+                       do_class_ac_error(qp, AETH_NAK_REM_OP_ERR,
+                                         IB_WC_LOC_QP_OP_ERR);
+                       state = RESPST_COMPLETE;
+                       break;
+
+               case RESPST_ERR_CQ_OVERFLOW:
+                       /* All - Class G */
+                       state = RESPST_ERROR;
+                       break;
+
+               case RESPST_DONE:
+                       /* a class A/C error recorded earlier is acted on
+                        * only now, after the request has been completed
+                        */
+                       if (qp->resp.goto_error) {
+                               state = RESPST_ERROR;
+                               break;
+                       }
+
+                       goto done;
+
+               case RESPST_EXIT:
+                       if (qp->resp.goto_error) {
+                               state = RESPST_ERROR;
+                               break;
+                       }
+
+                       goto exit;
+
+               case RESPST_RESET: {
+                       struct sk_buff *skb;
+
+                       /* discard all queued request packets, dropping the
+                        * qp reference held for each of them
+                        */
+                       while ((skb = skb_dequeue(&qp->req_pkts))) {
+                               rxe_drop_ref(qp);
+                               kfree_skb(skb);
+                       }
+
+                       /* empty the private receive queue; an SRQ is left
+                        * untouched
+                        */
+                       while (!qp->srq && qp->rq.queue &&
+                              queue_head(qp->rq.queue))
+                               advance_consumer(qp->rq.queue);
+
+                       qp->resp.wqe = NULL;
+                       goto exit;
+               }
+
+               case RESPST_ERROR:
+                       qp->resp.goto_error = 0;
+                       pr_warn("qp#%d moved to error state\n", qp_num(qp));
+                       rxe_qp_error(qp);
+                       goto exit;
+
+               default:
+                       /* NOTE(review): state is left unchanged here, so an
+                        * unexpected state would keep spinning in this loop
+                        */
+                       WARN_ON(1);
+               }
+       }
+
+exit:
+       ret = -EAGAIN;
+done:
+       return ret;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c

new file mode 100644 (file)

index 0000000..2a6e3cd
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_srq.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+/*
+ * Validate SRQ attributes for create (srq == NULL) or modify (srq != NULL).
+ *
+ * max_wr is checked when IB_SRQ_MAX_WR is set in mask, srq_limit when
+ * IB_SRQ_LIMIT is set, and max_sge only when mask equals IB_SRQ_INIT_MASK.
+ * max_wr and max_sge are raised in place to RXE_MIN_SRQ_WR and
+ * RXE_MIN_SRQ_SGE when they are below those minimums.
+ *
+ * Returns 0 if the attributes are acceptable, -EINVAL otherwise.
+ */
+int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+                    struct ib_srq_attr *attr, enum ib_srq_attr_mask mask)
+{
+       /* an existing SRQ that is already in error cannot be modified */
+       if (srq && srq->error) {
+               pr_warn("srq in error state\n");
+               return -EINVAL;
+       }
+
+       if (mask & IB_SRQ_MAX_WR) {
+               if (attr->max_wr > rxe->attr.max_srq_wr) {
+                       pr_warn("max_wr(%d) > max_srq_wr(%d)\n",
+                               attr->max_wr, rxe->attr.max_srq_wr);
+                       return -EINVAL;
+               }
+
+               if (attr->max_wr <= 0) {
+                       pr_warn("max_wr(%d) <= 0\n", attr->max_wr);
+                       return -EINVAL;
+               }
+
+               /* the queue may not shrink below the armed limit */
+               if (srq && srq->limit && (attr->max_wr < srq->limit)) {
+                       pr_warn("max_wr (%d) < srq->limit (%d)\n",
+                               attr->max_wr, srq->limit);
+                       return -EINVAL;
+               }
+
+               if (attr->max_wr < RXE_MIN_SRQ_WR)
+                       attr->max_wr = RXE_MIN_SRQ_WR;
+       }
+
+       if (mask & IB_SRQ_LIMIT) {
+               if (attr->srq_limit > rxe->attr.max_srq_wr) {
+                       pr_warn("srq_limit(%d) > max_srq_wr(%d)\n",
+                               attr->srq_limit, rxe->attr.max_srq_wr);
+                       return -EINVAL;
+               }
+
+               /* the limit must also fit within the current queue */
+               if (srq && (attr->srq_limit > srq->rq.queue->buf->index_mask)) {
+                       pr_warn("srq_limit (%d) > cur limit(%d)\n",
+                               attr->srq_limit,
+                                srq->rq.queue->buf->index_mask);
+                       return -EINVAL;
+               }
+       }
+
+       if (mask == IB_SRQ_INIT_MASK) {
+               if (attr->max_sge > rxe->attr.max_srq_sge) {
+                       pr_warn("max_sge(%d) > max_srq_sge(%d)\n",
+                               attr->max_sge, rxe->attr.max_srq_sge);
+                       return -EINVAL;
+               }
+
+               if (attr->max_sge < RXE_MIN_SRQ_SGE)
+                       attr->max_sge = RXE_MIN_SRQ_SGE;
+       }
+
+       return 0;
+}
+
+/*
+ * Initialise a newly allocated SRQ from the caller's init attributes.
+ *
+ * Copies the event handler, context, limit and sizes into srq, allocates the
+ * receive queue, publishes its mmap info through udata (if any) and, when
+ * the user's output buffer is large enough, copies the SRQ number to user
+ * space right after the mminfo struct.
+ *
+ * Returns 0 on success, -ENOMEM if the queue cannot be allocated, the error
+ * from do_mmap_info(), or -EFAULT if the SRQ number cannot be copied out.
+ * On every failure after the queue was allocated the queue is released here
+ * and srq->rq.queue is left NULL, so the queue is not leaked.
+ * NOTE(review): confirm the caller's error path does not release the queue
+ * unconditionally as well.
+ */
+int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
+                     struct ib_srq_init_attr *init,
+                     struct ib_ucontext *context, struct ib_udata *udata)
+{
+       int err;
+       int srq_wqe_size;
+       struct rxe_queue *q;
+
+       srq->ibsrq.event_handler        = init->event_handler;
+       srq->ibsrq.srq_context          = init->srq_context;
+       srq->limit              = init->attr.srq_limit;
+       srq->srq_num            = srq->pelem.index;
+       srq->rq.max_wr          = init->attr.max_wr;
+       srq->rq.max_sge         = init->attr.max_sge;
+
+       srq_wqe_size            = rcv_wqe_size(srq->rq.max_sge);
+
+       spin_lock_init(&srq->rq.producer_lock);
+       spin_lock_init(&srq->rq.consumer_lock);
+
+       q = rxe_queue_init(rxe, &srq->rq.max_wr,
+                          srq_wqe_size);
+       if (!q) {
+               pr_warn("unable to allocate queue for srq\n");
+               return -ENOMEM;
+       }
+
+       srq->rq.queue = q;
+
+       err = do_mmap_info(rxe, udata, false, context, q->buf,
+                          q->buf_size, &q->ip);
+       if (err)
+               goto err_free_queue;
+
+       if (udata && udata->outlen >= sizeof(struct mminfo) + sizeof(u32)) {
+               if (copy_to_user(udata->outbuf + sizeof(struct mminfo),
+                                &srq->srq_num, sizeof(u32))) {
+                       err = -EFAULT;
+                       goto err_free_queue;
+               }
+       }
+       return 0;
+
+err_free_queue:
+       /* same teardown rxe_srq_from_attr() uses when it gives up a queue */
+       rxe_queue_cleanup(q);
+       srq->rq.queue = NULL;
+       return err;
+}
+
+/*
+ * Apply a modify-SRQ request to an existing SRQ.
+ *
+ * With IB_SRQ_MAX_WR set, the receive queue is resized to attr->max_wr. If
+ * udata carries at least a __u64 of input, that value is taken as the user
+ * space address of a mminfo struct and udata's output buffer is redirected
+ * to it before the resize. With IB_SRQ_LIMIT set, srq->limit is updated.
+ *
+ * Returns 0 on success or a negative errno. If rxe_queue_resize() fails,
+ * the current queue is released and srq->rq.queue is set to NULL.
+ * NOTE(review): that leaves the SRQ without a queue after a failed resize;
+ * confirm this is intended.
+ */
+int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+                     struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
+                     struct ib_udata *udata)
+{
+       int err;
+       struct rxe_queue *q = srq->rq.queue;
+       /* only used for sizeof(mi) below */
+       struct mminfo mi = { .offset = 1, .size = 0};
+
+       if (mask & IB_SRQ_MAX_WR) {
+               /* Check that we can write the mminfo struct to user space */
+               if (udata && udata->inlen >= sizeof(__u64)) {
+                       __u64 mi_addr;
+
+                       /* Get address of user space mminfo struct */
+                       err = ib_copy_from_udata(&mi_addr, udata,
+                                                sizeof(mi_addr));
+                       if (err)
+                               goto err1;
+
+                       /* redirect udata output to the user's mminfo */
+                       udata->outbuf = (void __user *)(unsigned long)mi_addr;
+                       udata->outlen = sizeof(mi);
+
+                       if (!access_ok(VERIFY_WRITE,
+                                      (void __user *)udata->outbuf,
+                                       udata->outlen)) {
+                               err = -EFAULT;
+                               goto err1;
+                       }
+               }
+
+               /* attr->max_wr is passed by pointer and may be updated */
+               err = rxe_queue_resize(q, (unsigned int *)&attr->max_wr,
+                                      rcv_wqe_size(srq->rq.max_sge),
+                                      srq->rq.queue->ip ?
+                                               srq->rq.queue->ip->context :
+                                               NULL,
+                                      udata, &srq->rq.producer_lock,
+                                      &srq->rq.consumer_lock);
+               if (err)
+                       goto err2;
+       }
+
+       if (mask & IB_SRQ_LIMIT)
+               srq->limit = attr->srq_limit;
+
+       return 0;
+
+err2:
+       rxe_queue_cleanup(q);
+       srq->rq.queue = NULL;
+err1:
+       return err;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c

new file mode 100644 (file)

index 0000000..cf8e778
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_net.h"
+
+/*
+ * Copy an interface name from val into intf, stopping at the first newline
+ * or at the end of the string, and NUL-terminate the copy.
+ *
+ * Returns the length of the copied name, or 0 if val is NULL, the name is
+ * empty, or it does not fit in intf_len bytes including the terminator.
+ * intf is not written when val is NULL.
+ */
+static int sanitize_arg(const char *val, char *intf, int intf_len)
+{
+       int n = 0;
+
+       if (!val)
+               return 0;
+
+       while (n < intf_len - 1 && val[n] != '\0' && val[n] != '\n') {
+               intf[n] = val[n];
+               n++;
+       }
+       intf[n] = '\0';
+
+       if (n == 0)
+               return 0;
+
+       /* stopped because the buffer filled up: the name was truncated */
+       if (val[n] != '\0' && val[n] != '\n')
+               return 0;
+
+       return n;
+}
+
+/*
+ * Bring the rxe port bound to ndev in line with the net device's link
+ * state. Does nothing if no rxe device is attached to ndev.
+ */
+static void rxe_set_port_state(struct net_device *ndev)
+{
+       struct rxe_dev *rxe = net_to_rxe(ndev);
+
+       if (!rxe)
+               return;
+
+       /* anything other than running-with-carrier is treated as down */
+       if (netif_running(ndev) && netif_carrier_ok(ndev))
+               rxe_port_up(rxe);
+       else
+               rxe_port_down(rxe);
+}
+
+/*
+ * "add" module parameter handler: create an rxe device on top of the
+ * network interface named in val.
+ *
+ * Returns 0 on success, -EINVAL if the name is invalid, the interface does
+ * not exist in init_net, an rxe device is already configured on it, or
+ * rxe_net_add() fails.
+ */
+static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
+{
+       char intf[32];
+       struct net_device *ndev;
+       struct rxe_dev *rxe;
+       int ret = -EINVAL;
+
+       if (!sanitize_arg(val, intf, sizeof(intf))) {
+               pr_err("rxe: add: invalid interface name\n");
+               return ret;
+       }
+
+       ndev = dev_get_by_name(&init_net, intf);
+       if (!ndev) {
+               pr_err("interface %s not found\n", intf);
+               return ret;
+       }
+
+       if (net_to_rxe(ndev)) {
+               pr_err("rxe: already configured on %s\n", intf);
+               goto put_ndev;
+       }
+
+       rxe = rxe_net_add(ndev);
+       if (!rxe) {
+               pr_err("rxe: failed to add %s\n", intf);
+               goto put_ndev;
+       }
+
+       rxe_set_port_state(ndev);
+       pr_info("rxe: added %s to %s\n", rxe->ib_dev.name, intf);
+       ret = 0;
+
+put_ndev:
+       /* drop the reference taken by dev_get_by_name(), success or not */
+       dev_put(ndev);
+       return ret;
+}
+
+/*
+ * "remove" module parameter handler: remove the rxe device configured on
+ * the interface named in val, or every rxe device if the name is exactly
+ * "all".
+ *
+ * Returns 0 on success, -EINVAL if the name is invalid or no rxe device is
+ * configured on that interface.
+ */
+static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
+{
+       char intf[32];
+       struct rxe_dev *rxe;
+
+       if (!sanitize_arg(val, intf, sizeof(intf))) {
+               pr_err("rxe: remove: invalid interface name\n");
+               return -EINVAL;
+       }
+
+       /*
+        * Compare the whole string: a length-limited compare would also
+        * match the prefixes "a" and "al" and remove every device.
+        */
+       if (strcmp("all", intf) == 0) {
+               pr_info("rxe_sys: remove all\n");
+               rxe_remove_all();
+               return 0;
+       }
+
+       rxe = get_rxe_by_name(intf);
+
+       if (!rxe) {
+               pr_err("rxe: not configured on %s\n", intf);
+               return -EINVAL;
+       }
+
+       /* NOTE(review): list_del() runs without a visible lock; confirm */
+       list_del(&rxe->list);
+       rxe_remove(rxe);
+
+       return 0;
+}
+
+/* "add" parameter: only a setter is provided, there is nothing to read */
+static const struct kernel_param_ops rxe_add_ops = {
+       .set = rxe_param_set_add,
+};
+
+/* "remove" parameter: only a setter is provided, there is nothing to read */
+static const struct kernel_param_ops rxe_remove_ops = {
+       .set = rxe_param_set_remove,
+};
+
+/* both parameters are registered with mode 0200 (owner write only) */
+module_param_cb(add, &rxe_add_ops, NULL, 0200);
+MODULE_PARM_DESC(add, "Create RXE device over network interface");
+module_param_cb(remove, &rxe_remove_ops, NULL, 0200);
+MODULE_PARM_DESC(remove, "Remove RXE device over network interface");
diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c

new file mode 100644 (file)

index 0000000..1e19bf8
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_task.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/hardirq.h>
+
+#include "rxe_task.h"
+
+/*
+ * Call the task function repeatedly until it returns non-zero, without
+ * touching the task state or its lock. The final return value is stored in
+ * task->ret and returned.
+ */
+int __rxe_do_task(struct rxe_task *task)
+{
+       int rc;
+
+       do {
+               rc = task->func(task->arg);
+       } while (rc == 0);
+
+       task->ret = rc;
+
+       return rc;
+}
+
+/*
+ * Run a task's function until it reports there is no more work.
+ *
+ * data is the struct rxe_task pointer cast to unsigned long (this function
+ * is also the tasklet callback). Only the caller that moves the task from
+ * START to BUSY runs func; a caller that finds it BUSY marks it ARMED and
+ * returns, and a caller that finds it ARMED just returns.
+ *
+ * this locking is due to a potential race where
+ * a second caller finds the task already running
+ * but looks just after the last call to func
+ */
+void rxe_do_task(unsigned long data)
+{
+       int cont;
+       int ret;
+       unsigned long flags;
+       struct rxe_task *task = (struct rxe_task *)data;
+
+       spin_lock_irqsave(&task->state_lock, flags);
+       switch (task->state) {
+       case TASK_STATE_START:
+               /* we own the task now; run func below */
+               task->state = TASK_STATE_BUSY;
+               spin_unlock_irqrestore(&task->state_lock, flags);
+               break;
+
+       case TASK_STATE_BUSY:
+               /* ask the current runner to call func once more */
+               task->state = TASK_STATE_ARMED;
+               /* fall through to */
+       case TASK_STATE_ARMED:
+               spin_unlock_irqrestore(&task->state_lock, flags);
+               return;
+
+       default:
+               spin_unlock_irqrestore(&task->state_lock, flags);
+               pr_warn("bad state = %d in rxe_do_task\n", task->state);
+               return;
+       }
+
+       do {
+               cont = 0;
+               ret = task->func(task->arg);
+
+               spin_lock_irqsave(&task->state_lock, flags);
+               switch (task->state) {
+               case TASK_STATE_BUSY:
+                       /* non-zero means func is done: release the task */
+                       if (ret)
+                               task->state = TASK_STATE_START;
+                       else
+                               cont = 1;
+                       break;
+
+               /* someone tried to run the task since the last time we called
+                * func, so we will call one more time regardless of the
+                * return value
+                */
+               case TASK_STATE_ARMED:
+                       task->state = TASK_STATE_BUSY;
+                       cont = 1;
+                       break;
+
+               default:
+                       pr_warn("bad state = %d in rxe_do_task\n",
+                               task->state);
+               }
+               spin_unlock_irqrestore(&task->state_lock, flags);
+       } while (cont);
+
+       /* remember the return value of the last call to func */
+       task->ret = ret;
+}
+
+/*
+ * Initialise a task: record obj, arg and func, copy name (truncated to fit
+ * task->name), set up the tasklet so that it calls rxe_do_task() on this
+ * task, and put the task in the START state. Always returns 0.
+ */
+int rxe_init_task(void *obj, struct rxe_task *task,
+                 void *arg, int (*func)(void *), char *name)
+{
+       task->obj       = obj;
+       task->arg       = arg;
+       task->func      = func;
+       snprintf(task->name, sizeof(task->name), "%s", name);
+
+       tasklet_init(&task->tasklet, rxe_do_task, (unsigned long)task);
+
+       task->state = TASK_STATE_START;
+       spin_lock_init(&task->state_lock);
+
+       return 0;
+}
+
+/* Tear down a task by killing its tasklet. */
+void rxe_cleanup_task(struct rxe_task *task)
+{
+       tasklet_kill(&task->tasklet);
+}
+
+/*
+ * Run a task. With sched == 0 the task runs synchronously in the caller's
+ * context; otherwise its tasklet is scheduled to run it later.
+ */
+void rxe_run_task(struct rxe_task *task, int sched)
+{
+       if (!sched) {
+               rxe_do_task((unsigned long)task);
+               return;
+       }
+
+       tasklet_schedule(&task->tasklet);
+}
+
+/* Disable the task's tasklet. */
+void rxe_disable_task(struct rxe_task *task)
+{
+       tasklet_disable(&task->tasklet);
+}
+
+/* Re-enable the task's tasklet. */
+void rxe_enable_task(struct rxe_task *task)
+{
+       tasklet_enable(&task->tasklet);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h

new file mode 100644 (file)

index 0000000..d14aa6d
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_task.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_TASK_H
+#define RXE_TASK_H
+
+/* values of rxe_task.state, see rxe_do_task() */
+enum {
+       /* not running; the next rxe_do_task() caller runs func */
+       TASK_STATE_START        = 0,
+       /* a caller is currently running func */
+       TASK_STATE_BUSY         = 1,
+       /* running, and another run was requested in the meantime */
+       TASK_STATE_ARMED        = 2,
+};
+
+/*
+ * data structure to describe a 'task' which is a short
+ * function that returns 0 as long as it needs to be
+ * called again.
+ */
+struct rxe_task {
+       void                    *obj;   /* set by rxe_init_task() */
+       struct tasklet_struct   tasklet; /* runs rxe_do_task() on this task */
+       int                     state;  /* one of TASK_STATE_* */
+       spinlock_t              state_lock; /* spinlock for task state */
+       void                    *arg;   /* argument passed to func */
+       int                     (*func)(void *arg);
+       int                     ret;    /* return value of last func call */
+       char                    name[16];
+};
+
+/*
+ * init rxe_task structure
+ *     obj  => stored in the task (task->obj)
+ *     arg  => parameter to pass to func
+ *     func => function to call until it returns != 0
+ *     name => copied into the task, truncated to fit task->name
+ */
+int rxe_init_task(void *obj, struct rxe_task *task,
+                 void *arg, int (*func)(void *), char *name);
+
+/* cleanup task */
+void rxe_cleanup_task(struct rxe_task *task);
+
+/*
+ * raw call to func in loop without any checking
+ * can call when tasklets are disabled
+ */
+int __rxe_do_task(struct rxe_task *task);
+
+/*
+ * common function called by any of the main tasklets
+ * If there is any chance that there is additional
+ * work to do someone must reschedule the task before
+ * leaving
+ */
+void rxe_do_task(unsigned long data);
+
+/* run a task directly when sched is zero, else schedule it to run as a
+ * tasklet.
+ */
+void rxe_run_task(struct rxe_task *task, int sched);
+
+/* keep a task from scheduling */
+void rxe_disable_task(struct rxe_task *task);
+
+/* allow task to run */
+void rxe_enable_task(struct rxe_task *task);
+
+#endif /* RXE_TASK_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c

new file mode 100644 (file)

index 0000000..4552be9
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -0,0 +1,1330 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+/*
+ * Report the device attributes stored in rxe->attr.
+ * Any non-empty uhw (input or output length) is rejected with -EINVAL.
+ */
+static int rxe_query_device(struct ib_device *dev,
+                           struct ib_device_attr *attr,
+                           struct ib_udata *uhw)
+{
+       struct rxe_dev *rxe = to_rdev(dev);
+
+       if (uhw->inlen || uhw->outlen)
+               return -EINVAL;
+
+       *attr = rxe->attr;
+       return 0;
+}
+
+/*
+ * Map an Ethernet link speed (presumably Mb/s as reported by ethtool) to an
+ * IB active width and active speed pair. Speeds up to 10000 are reported as
+ * 1X links, everything faster as 4X.
+ */
+static void rxe_eth_speed_to_ib_speed(int speed, u8 *active_speed,
+                                     u8 *active_width)
+{
+       *active_width = (speed <= 10000) ? IB_WIDTH_1X : IB_WIDTH_4X;
+
+       if (speed <= 1000)
+               *active_speed = IB_SPEED_SDR;
+       else if (speed <= 10000)
+               *active_speed = IB_SPEED_FDR10;
+       else if (speed <= 20000)
+               *active_speed = IB_SPEED_DDR;
+       else if (speed <= 30000)
+               *active_speed = IB_SPEED_QDR;
+       else if (speed <= 40000)
+               *active_speed = IB_SPEED_FDR10;
+       else
+               *active_speed = IB_SPEED_EDR;
+}
+
+/*
+ * Report the attributes of port 1 (the only port), with active speed and
+ * width derived from the underlying net device's Ethernet link speed.
+ *
+ * The speed is taken from the get_link_ksettings ethtool callback if the
+ * driver has one, else from get_settings. If neither exists, or the callback
+ * fails, a warning is logged and 1000 is assumed, so that an uninitialised
+ * settings struct is never read.
+ *
+ * Returns 0 on success, -EINVAL for any port number other than 1.
+ */
+static int rxe_query_port(struct ib_device *dev,
+                         u8 port_num, struct ib_port_attr *attr)
+{
+       struct rxe_dev *rxe = to_rdev(dev);
+       const struct ethtool_ops *ops;
+       struct rxe_port *port;
+       bool speed_known = false;
+       u32 speed = 0;
+
+       if (unlikely(port_num != 1)) {
+               pr_warn("invalid port_number %d\n", port_num);
+               return -EINVAL;
+       }
+
+       port = &rxe->port;
+
+       *attr = port->attr;
+
+       mutex_lock(&rxe->usdev_lock);
+       ops = rxe->ndev->ethtool_ops;
+       if (ops->get_link_ksettings) {
+               struct ethtool_link_ksettings ks;
+
+               if (!ops->get_link_ksettings(rxe->ndev, &ks)) {
+                       speed = ks.base.speed;
+                       speed_known = true;
+               }
+       } else if (ops->get_settings) {
+               struct ethtool_cmd cmd;
+
+               if (!ops->get_settings(rxe->ndev, &cmd)) {
+                       /* cmd.speed alone holds only the low 16 bits */
+                       speed = ethtool_cmd_speed(&cmd);
+                       speed_known = true;
+               }
+       }
+
+       if (!speed_known) {
+               pr_warn("%s speed is unknown, defaulting to 1000\n", rxe->ndev->name);
+               speed = 1000;
+       }
+       rxe_eth_speed_to_ib_speed(speed, &attr->active_speed, &attr->active_width);
+       mutex_unlock(&rxe->usdev_lock);
+
+       return 0;
+}
+
+/*
+ * Look up a GID in the cached GID table.
+ *
+ * Valid indices are 0 .. RXE_PORT_GID_TBL_LEN - 1, the same bound that
+ * rxe_add_gid() and rxe_del_gid() enforce. If the cache returns -EAGAIN,
+ * the zero GID is returned instead of an error.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range index, or the error
+ * from ib_get_cached_gid().
+ */
+static int rxe_query_gid(struct ib_device *device,
+                        u8 port_num, int index, union ib_gid *gid)
+{
+       int ret;
+
+       if (index < 0 || index >= RXE_PORT_GID_TBL_LEN)
+               return -EINVAL;
+
+       ret = ib_get_cached_gid(device, port_num, index, gid, NULL);
+       if (ret == -EAGAIN) {
+               memcpy(gid, &zgid, sizeof(*gid));
+               return 0;
+       }
+
+       return ret;
+}
+
+/*
+ * add_gid callback: only validates the table index, nothing is stored here.
+ * Returns -EINVAL if index is outside the GID table, 0 otherwise.
+ */
+static int rxe_add_gid(struct ib_device *device, u8 port_num, unsigned int
+                      index, const union ib_gid *gid,
+                      const struct ib_gid_attr *attr, void **context)
+{
+       if (index >= RXE_PORT_GID_TBL_LEN)
+               return -EINVAL;
+       return 0;
+}
+
+/*
+ * del_gid callback: only validates the table index, nothing is removed here.
+ * Returns -EINVAL if index is outside the GID table, 0 otherwise.
+ */
+static int rxe_del_gid(struct ib_device *device, u8 port_num, unsigned int
+                      index, void **context)
+{
+       if (index >= RXE_PORT_GID_TBL_LEN)
+               return -EINVAL;
+       return 0;
+}
+
+/*
+ * Return the net device backing this rxe device with a reference held
+ * (dev_hold), or NULL if there is none. port_num is not used.
+ */
+static struct net_device *rxe_get_netdev(struct ib_device *device,
+                                        u8 port_num)
+{
+       struct net_device *ndev = to_rdev(device)->ndev;
+
+       if (!ndev)
+               return NULL;
+
+       dev_hold(ndev);
+       return ndev;
+}
+
+/*
+ * Read one entry of the port's P_Key table into *pkey.
+ * Returns 0 on success, -EINVAL if port_num is not 1 or index is outside
+ * the table (port->attr.pkey_tbl_len entries).
+ */
+static int rxe_query_pkey(struct ib_device *device,
+                         u8 port_num, u16 index, u16 *pkey)
+{
+       struct rxe_dev *rxe = to_rdev(device);
+       struct rxe_port *port = &rxe->port;
+
+       if (unlikely(port_num != 1)) {
+               dev_warn(device->dma_device, "invalid port_num = %d\n",
+                        port_num);
+               return -EINVAL;
+       }
+
+       if (unlikely(index >= port->attr.pkey_tbl_len)) {
+               dev_warn(device->dma_device, "invalid index = %d\n",
+                        index);
+               return -EINVAL;
+       }
+
+       *pkey = port->pkey_tbl[index];
+       return 0;
+}
+
+/*
+ * Update device-level properties selected by mask: the system image GUID
+ * (stored converted with cpu_to_be64) and/or the node description.
+ * Always returns 0.
+ */
+static int rxe_modify_device(struct ib_device *dev,
+                            int mask, struct ib_device_modify *attr)
+{
+       struct rxe_dev *rxe = to_rdev(dev);
+
+       if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
+               rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid);
+
+       if (mask & IB_DEVICE_MODIFY_NODE_DESC) {
+               memcpy(rxe->ib_dev.node_desc,
+                      attr->node_desc, sizeof(rxe->ib_dev.node_desc));
+       }
+
+       return 0;
+}
+
+/*
+ * Modify the attributes of port 1: set and then clear the requested
+ * capability flags (applied regardless of mask), and reset the Q_Key
+ * violation counter when IB_PORT_RESET_QKEY_CNTR is set in mask.
+ * Returns 0 on success, -EINVAL for any port number other than 1.
+ */
+static int rxe_modify_port(struct ib_device *dev,
+                          u8 port_num, int mask, struct ib_port_modify *attr)
+{
+       struct rxe_port *port = &to_rdev(dev)->port;
+
+       if (unlikely(port_num != 1)) {
+               pr_warn("invalid port_num = %d\n", port_num);
+               return -EINVAL;
+       }
+
+       port->attr.port_cap_flags |= attr->set_port_cap_mask;
+       port->attr.port_cap_flags &= ~attr->clr_port_cap_mask;
+
+       if (mask & IB_PORT_RESET_QKEY_CNTR)
+               port->attr.qkey_viol_cntr = 0;
+
+       return 0;
+}
+
+/* Return the link layer reported by the device's ifc_ops->link_layer(). */
+static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev,
+                                              u8 port_num)
+{
+       struct rxe_dev *rxe = to_rdev(dev);
+
+       return rxe->ifc_ops->link_layer(rxe, port_num);
+}
+
+/*
+ * Allocate a user context from the device's uc_pool.
+ * Returns the embedded ib_ucontext, or ERR_PTR(-ENOMEM) if the pool
+ * allocation fails. udata is not used.
+ */
+static struct ib_ucontext *rxe_alloc_ucontext(struct ib_device *dev,
+                                             struct ib_udata *udata)
+{
+       struct rxe_ucontext *uc = rxe_alloc(&to_rdev(dev)->uc_pool);
+
+       if (!uc)
+               return ERR_PTR(-ENOMEM);
+
+       return &uc->ibuc;
+}
+
+/* Drop a reference on the user context. Always returns 0. */
+static int rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
+{
+       struct rxe_ucontext *uc = to_ruc(ibuc);
+
+       rxe_drop_ref(uc);
+       return 0;
+}
+
+/*
+ * Fill in the immutable port properties: table lengths are taken from
+ * rxe_query_port(), the core capability is RoCE with UDP encapsulation and
+ * the maximum MAD size is IB_MGMT_MAD_SIZE.
+ * Returns 0 on success or the error from rxe_query_port().
+ */
+static int rxe_port_immutable(struct ib_device *dev, u8 port_num,
+                             struct ib_port_immutable *immutable)
+{
+       int err;
+       struct ib_port_attr attr;
+
+       err = rxe_query_port(dev, port_num, &attr);
+       if (err)
+               return err;
+
+       immutable->pkey_tbl_len = attr.pkey_tbl_len;
+       immutable->gid_tbl_len = attr.gid_tbl_len;
+       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+       immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+
+       return 0;
+}
+
+/*
+ * Allocate a protection domain from the device's pd_pool.
+ * Returns the embedded ib_pd, or ERR_PTR(-ENOMEM) if the pool allocation
+ * fails. context and udata are not used.
+ */
+static struct ib_pd *rxe_alloc_pd(struct ib_device *dev,
+                                 struct ib_ucontext *context,
+                                 struct ib_udata *udata)
+{
+       struct rxe_pd *pd = rxe_alloc(&to_rdev(dev)->pd_pool);
+
+       if (!pd)
+               return ERR_PTR(-ENOMEM);
+
+       return &pd->ibpd;
+}
+
+/* Drop a reference on the protection domain. Always returns 0. */
+static int rxe_dealloc_pd(struct ib_pd *ibpd)
+{
+       struct rxe_pd *pd = to_rpd(ibpd);
+
+       rxe_drop_ref(pd);
+       return 0;
+}
+
+/*
+ * Fill an address vector from AH attributes.
+ *
+ * Looks up the source GID selected by attr->grh.sgid_index, initialises av
+ * from attr and then adds the IP addressing information derived from the
+ * GID. Returns 0 on success or the first error encountered.
+ */
+static int rxe_init_av(struct rxe_dev *rxe, struct ib_ah_attr *attr,
+                      struct rxe_av *av)
+{
+       int err;
+       union ib_gid sgid;
+       struct ib_gid_attr sgid_attr;
+
+       err = ib_get_cached_gid(&rxe->ib_dev, attr->port_num,
+                               attr->grh.sgid_index, &sgid,
+                               &sgid_attr);
+       if (err) {
+               pr_err("Failed to query sgid. err = %d\n", err);
+               return err;
+       }
+
+       err = rxe_av_from_attr(rxe, attr->port_num, av, attr);
+       if (!err)
+               err = rxe_av_fill_ip_info(rxe, av, attr, &sgid_attr, &sgid);
+
+       /* presumably ib_get_cached_gid() returned ndev with a reference held */
+       if (sgid_attr.ndev)
+               dev_put(sgid_attr.ndev);
+       return err;
+}
+
+/*
+ * Create an address handle in protection domain ibpd.
+ *
+ * The attributes are validated first, then an AH is allocated from the
+ * ah_pool, a reference is taken on the PD and the address vector is
+ * initialised. Returns the embedded ib_ah, or an ERR_PTR() carrying the
+ * error (-ENOMEM if the pool allocation fails).
+ */
+static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
+{
+       int err;
+       struct rxe_dev *rxe = to_rdev(ibpd->device);
+       struct rxe_pd *pd = to_rpd(ibpd);
+       struct rxe_ah *ah;
+
+       err = rxe_av_chk_attr(rxe, attr);
+       if (err)
+               goto err1;
+
+       ah = rxe_alloc(&rxe->ah_pool);
+       if (!ah) {
+               err = -ENOMEM;
+               goto err1;
+       }
+
+       /* the AH keeps its PD alive until rxe_destroy_ah() */
+       rxe_add_ref(pd);
+       ah->pd = pd;
+
+       err = rxe_init_av(rxe, attr, &ah->av);
+       if (err)
+               goto err2;
+
+       return &ah->ibah;
+
+err2:
+       /* undo the PD reference and release the AH allocated above */
+       rxe_drop_ref(pd);
+       rxe_drop_ref(ah);
+err1:
+       return ERR_PTR(err);
+}
+
+/*
+ * Replace the address vector of an existing address handle.
+ * The new attributes are validated before the AV is re-initialised.
+ * Returns 0 on success or the error from validation or initialisation.
+ */
+static int rxe_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+       struct rxe_dev *rxe = to_rdev(ibah->device);
+       struct rxe_ah *ah = to_rah(ibah);
+       int err = rxe_av_chk_attr(rxe, attr);
+
+       if (err)
+               return err;
+
+       return rxe_init_av(rxe, attr, &ah->av);
+}
+
+/* Convert the handle's address vector back into attr. Always returns 0. */
+static int rxe_query_ah(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+       struct rxe_dev *rxe = to_rdev(ibah->device);
+       struct rxe_ah *ah = to_rah(ibah);
+
+       rxe_av_to_attr(rxe, &ah->av, attr);
+       return 0;
+}
+
+/*
+ * Destroy an address handle: drop the PD reference taken in
+ * rxe_create_ah(), then the reference on the handle itself.
+ */
+static int rxe_destroy_ah(struct ib_ah *ibah)
+{
+       struct rxe_ah *handle = to_rah(ibah);
+
+       rxe_drop_ref(handle->pd);
+       rxe_drop_ref(handle);
+
+       return 0;
+}
+
+/*
+ * Append one receive work request to the receive queue @rq.
+ *
+ * Returns -ENOMEM when the queue is full and -EINVAL when the request has
+ * more scatter/gather entries than the queue allows; 0 otherwise.  Both
+ * callers in this file invoke it with rq->producer_lock held.
+ */
+static int post_one_recv(struct rxe_rq *rq, struct ib_recv_wr *ibwr)
+{
+       struct rxe_recv_wqe *wqe;
+       int nsge = ibwr->num_sge;
+       u32 total = 0;
+       int idx;
+
+       if (unlikely(queue_full(rq->queue)))
+               return -ENOMEM;
+
+       if (unlikely(nsge > rq->max_sge))
+               return -EINVAL;
+
+       /* total byte count described by the scatter/gather list */
+       for (idx = 0; idx < nsge; idx++)
+               total += ibwr->sg_list[idx].length;
+
+       wqe = producer_addr(rq->queue);
+       wqe->wr_id = ibwr->wr_id;
+       wqe->num_sge = nsge;
+
+       memcpy(wqe->dma.sge, ibwr->sg_list, nsge * sizeof(struct ib_sge));
+
+       wqe->dma.length = total;
+       wqe->dma.resid = total;
+       wqe->dma.num_sge = nsge;
+       wqe->dma.cur_sge = 0;
+       wqe->dma.sge_offset = 0;
+
+       /* the entry must be fully written before the producer index moves */
+       smp_wmb();
+
+       advance_producer(rq->queue);
+
+       return 0;
+}
+
+/*
+ * Create a shared receive queue on protection domain @ibpd.
+ *
+ * init->attr is checked with rxe_srq_chk_attr() before the SRQ is taken
+ * from rxe->srq_pool.  The user context is read from the PD's uobject only
+ * when @udata is supplied.
+ *
+ * Returns the embedded ib_srq, or an ERR_PTR() on failure.
+ */
+static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd,
+                                    struct ib_srq_init_attr *init,
+                                    struct ib_udata *udata)
+{
+       struct rxe_dev *rxe = to_rdev(ibpd->device);
+       struct rxe_pd *pd = to_rpd(ibpd);
+       struct ib_ucontext *uctx = NULL;
+       struct rxe_srq *srq;
+       int ret;
+
+       if (udata)
+               uctx = ibpd->uobject->context;
+
+       ret = rxe_srq_chk_attr(rxe, NULL, &init->attr, IB_SRQ_INIT_MASK);
+       if (ret)
+               return ERR_PTR(ret);
+
+       srq = rxe_alloc(&rxe->srq_pool);
+       if (!srq)
+               return ERR_PTR(-ENOMEM);
+
+       rxe_add_index(srq);
+       rxe_add_ref(pd);
+       srq->pd = pd;
+
+       ret = rxe_srq_from_init(rxe, srq, init, uctx, udata);
+       if (!ret)
+               return &srq->ibsrq;
+
+       /* release what was taken above, in reverse order */
+       rxe_drop_ref(pd);
+       rxe_drop_index(srq);
+       rxe_drop_ref(srq);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Modify the SRQ attributes selected by @mask.
+ *
+ * The new attributes are validated against the current SRQ before they are
+ * applied.  Returns 0 on success or the error from either step.
+ */
+static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+                         enum ib_srq_attr_mask mask,
+                         struct ib_udata *udata)
+{
+       struct rxe_srq *srq = to_rsrq(ibsrq);
+       struct rxe_dev *rxe = to_rdev(ibsrq->device);
+       int ret;
+
+       ret = rxe_srq_chk_attr(rxe, srq, attr, mask);
+       if (ret)
+               return ret;
+
+       return rxe_srq_from_attr(rxe, srq, attr, mask, udata);
+}
+
+/*
+ * Report the current SRQ attributes in @attr.
+ *
+ * Returns -EINVAL when the SRQ is flagged as being in error.  max_wr is
+ * reported as the index mask of the underlying queue buffer.
+ */
+static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+       struct rxe_srq *srq = to_rsrq(ibsrq);
+
+       if (srq->error)
+               return -EINVAL;
+
+       attr->srq_limit = srq->limit;
+       attr->max_sge = srq->rq.max_sge;
+       attr->max_wr = srq->rq.queue->buf->index_mask;
+
+       return 0;
+}
+
+/*
+ * Destroy a shared receive queue.
+ *
+ * The receive queue is cleaned up if one was set up, then the PD
+ * reference, the pool index and the SRQ reference are released.
+ */
+static int rxe_destroy_srq(struct ib_srq *ibsrq)
+{
+       struct rxe_srq *victim = to_rsrq(ibsrq);
+       struct rxe_rq *rq = &victim->rq;
+
+       if (rq->queue)
+               rxe_queue_cleanup(rq->queue);
+
+       rxe_drop_ref(victim->pd);
+       rxe_drop_index(victim);
+       rxe_drop_ref(victim);
+
+       return 0;
+}
+
+/*
+ * Post a chain of receive work requests to a shared receive queue.
+ *
+ * Requests are posted in list order under the producer lock.  Posting
+ * stops at the first failure; that request is reported through @bad_wr and
+ * its error is returned.  Returns 0 when the whole chain was posted.
+ */
+static int rxe_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+                            struct ib_recv_wr **bad_wr)
+{
+       struct rxe_srq *srq = to_rsrq(ibsrq);
+       unsigned long irqflags;
+       int ret = 0;
+
+       spin_lock_irqsave(&srq->rq.producer_lock, irqflags);
+
+       for (; wr; wr = wr->next) {
+               ret = post_one_recv(&srq->rq, wr);
+               if (unlikely(ret))
+                       break;
+       }
+
+       spin_unlock_irqrestore(&srq->rq.producer_lock, irqflags);
+
+       /* on failure wr still points at the request that was rejected */
+       if (ret)
+               *bad_wr = wr;
+
+       return ret;
+}
+
+/*
+ * Create a queue pair on protection domain @ibpd.
+ *
+ * @init is validated with rxe_qp_chk_init() before the QP is taken from
+ * rxe->qp_pool.  A request that arrives with @udata is marked as a user QP
+ * and must not carry any input data (udata->inlen must be zero).
+ *
+ * Returns the embedded ib_qp, or an ERR_PTR() on failure.  Every failure
+ * after the allocation drops the reference on the new QP, so nothing taken
+ * from the pool is leaked.
+ */
+static struct ib_qp *rxe_create_qp(struct ib_pd *ibpd,
+                                  struct ib_qp_init_attr *init,
+                                  struct ib_udata *udata)
+{
+       int err;
+       struct rxe_dev *rxe = to_rdev(ibpd->device);
+       struct rxe_pd *pd = to_rpd(ibpd);
+       struct rxe_qp *qp;
+
+       err = rxe_qp_chk_init(rxe, init);
+       if (err)
+               goto err1;
+
+       qp = rxe_alloc(&rxe->qp_pool);
+       if (!qp) {
+               err = -ENOMEM;
+               goto err1;
+       }
+
+       if (udata) {
+               if (udata->inlen) {
+                       err = -EINVAL;
+                       /* qp is already allocated: it must be released */
+                       goto err2;
+               }
+               qp->is_user = 1;
+       }
+
+       rxe_add_index(qp);
+
+       err = rxe_qp_from_init(rxe, qp, pd, init, udata, ibpd);
+       if (err)
+               goto err3;
+
+       return &qp->ibqp;
+
+err3:
+       rxe_drop_index(qp);
+err2:
+       rxe_drop_ref(qp);
+err1:
+       return ERR_PTR(err);
+}
+
+/*
+ * Modify the QP attributes selected by @mask.
+ *
+ * The requested change is validated against the current QP before it is
+ * applied.  Returns 0 on success or the error from either step.
+ */
+static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                        int mask, struct ib_udata *udata)
+{
+       struct rxe_dev *rxe = to_rdev(ibqp->device);
+       struct rxe_qp *qp = to_rqp(ibqp);
+       int ret;
+
+       ret = rxe_qp_chk_attr(rxe, qp, attr, mask);
+       if (ret)
+               return ret;
+
+       return rxe_qp_from_attr(qp, attr, mask, udata);
+}
+
+/*
+ * Report the state of a QP: @init receives the creation attributes and
+ * @attr the current attributes selected by @mask.  Always returns 0.
+ */
+static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                       int mask, struct ib_qp_init_attr *init)
+{
+       struct rxe_qp *queried = to_rqp(ibqp);
+
+       rxe_qp_to_init(queried, init);
+       rxe_qp_to_attr(queried, attr, mask);
+
+       return 0;
+}
+
+/*
+ * Destroy a QP: tear it down with rxe_qp_destroy(), then give up its pool
+ * index and its reference.  Always returns 0.
+ */
+static int rxe_destroy_qp(struct ib_qp *ibqp)
+{
+       struct rxe_qp *victim = to_rqp(ibqp);
+
+       rxe_qp_destroy(victim);
+
+       rxe_drop_index(victim);
+       rxe_drop_ref(victim);
+
+       return 0;
+}
+
+/*
+ * Sanity-check one send work request before it is queued.
+ *
+ * Rejects the request with -EINVAL when
+ *  - it has more scatter/gather entries than the send queue allows,
+ *  - it is an atomic operation shorter than 8 bytes or whose remote
+ *    address is not 8-byte aligned, or
+ *  - it asks for inline data longer than the queue's inline limit.
+ *
+ * Returns 0 when the request is acceptable.
+ */
+static int validate_send_wr(struct rxe_qp *qp, struct ib_send_wr *ibwr,
+                           unsigned int mask, unsigned int length)
+{
+       struct rxe_sq *sq = &qp->sq;
+       int nsge = ibwr->num_sge;
+
+       if (unlikely(nsge > sq->max_sge))
+               return -EINVAL;
+
+       if (unlikely(mask & WR_ATOMIC_MASK)) {
+               if (length < 8 || (atomic_wr(ibwr)->remote_addr & 0x7))
+                       return -EINVAL;
+       }
+
+       if (unlikely((ibwr->send_flags & IB_SEND_INLINE) &&
+                    (length > sq->max_inline)))
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Copy the fields of the verbs work request @ibwr into the rxe work
+ * request @wr.
+ *
+ * The common fields are always copied.  UD, SMI and GSI QPs additionally
+ * take the datagram fields; every other QP type takes the fields selected
+ * by the opcode.  Opcodes that are not listed copy nothing extra.
+ */
+static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr,
+                        struct ib_send_wr *ibwr)
+{
+       wr->wr_id = ibwr->wr_id;
+       wr->num_sge = ibwr->num_sge;
+       wr->opcode = ibwr->opcode;
+       wr->send_flags = ibwr->send_flags;
+
+       switch (qp_type(qp)) {
+       case IB_QPT_UD:
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn;
+               wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey;
+               /* the pkey index is only copied for GSI */
+               if (qp_type(qp) == IB_QPT_GSI)
+                       wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index;
+               if (wr->opcode == IB_WR_SEND_WITH_IMM)
+                       wr->ex.imm_data = ibwr->ex.imm_data;
+               return;
+       default:
+               break;
+       }
+
+       switch (wr->opcode) {
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               wr->ex.imm_data = ibwr->ex.imm_data;
+               /* fall through - also needs the remote address and rkey */
+       case IB_WR_RDMA_READ:
+       case IB_WR_RDMA_WRITE:
+               wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr;
+               wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey;
+               break;
+       case IB_WR_SEND_WITH_IMM:
+               wr->ex.imm_data = ibwr->ex.imm_data;
+               break;
+       case IB_WR_SEND_WITH_INV:
+       case IB_WR_LOCAL_INV:
+               wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey;
+               break;
+       case IB_WR_ATOMIC_CMP_AND_SWP:
+       case IB_WR_ATOMIC_FETCH_AND_ADD:
+               wr->wr.atomic.remote_addr = atomic_wr(ibwr)->remote_addr;
+               wr->wr.atomic.compare_add = atomic_wr(ibwr)->compare_add;
+               wr->wr.atomic.swap = atomic_wr(ibwr)->swap;
+               wr->wr.atomic.rkey = atomic_wr(ibwr)->rkey;
+               break;
+       case IB_WR_REG_MR:
+               wr->wr.reg.mr = reg_wr(ibwr)->mr;
+               wr->wr.reg.key = reg_wr(ibwr)->key;
+               wr->wr.reg.access = reg_wr(ibwr)->access;
+               break;
+       default:
+               break;
+       }
+}
+
+/*
+ * Fill the send queue entry @wqe from the work request @ibwr.
+ *
+ * @mask is the opcode mask computed by the caller and @length the total
+ * number of bytes described by the scatter/gather list.
+ *
+ * Returns 0 on success, or -EFAULT when inline data of a user QP cannot be
+ * copied from user space.
+ *
+ * NOTE(review): post_one_send() calls this with sq_lock held and interrupts
+ * disabled, yet the inline path below calls copy_from_user() - confirm
+ * that this cannot fault or sleep in that context.
+ */
+static int init_send_wqe(struct rxe_qp *qp, struct ib_send_wr *ibwr,
+                        unsigned int mask, unsigned int length,
+                        struct rxe_send_wqe *wqe)
+{
+       int num_sge = ibwr->num_sge;
+       struct ib_sge *sge;
+       int i;
+       u8 *p;
+
+       init_send_wr(qp, &wqe->wr, ibwr);
+
+       /* datagram-style QPs snapshot the address vector of the AH */
+       if (qp_type(qp) == IB_QPT_UD ||
+           qp_type(qp) == IB_QPT_SMI ||
+           qp_type(qp) == IB_QPT_GSI)
+               memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av));
+
+       if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) {
+               /* inline: gather the payload itself into the wqe */
+               p = wqe->dma.inline_data;
+
+               sge = ibwr->sg_list;
+               for (i = 0; i < num_sge; i++, sge++) {
+                       /* user QPs carry user addresses, kernel QPs kernel ones */
+                       if (qp->is_user && copy_from_user(p, (__user void *)
+                                           (uintptr_t)sge->addr, sge->length))
+                               return -EFAULT;
+
+                       else if (!qp->is_user)
+                               memcpy(p, (void *)(uintptr_t)sge->addr,
+                                      sge->length);
+
+                       p += sge->length;
+               }
+       } else if (mask & WR_REG_MASK) {
+               /*
+                * WR_REG_MASK requests return here: only mask and state are
+                * set, the dma fields, iova and ssn below are left untouched.
+                */
+               wqe->mask = mask;
+               wqe->state = wqe_state_posted;
+               return 0;
+       } else
+               /* not inline: keep the scatter/gather list for later use */
+               memcpy(wqe->dma.sge, ibwr->sg_list,
+                      num_sge * sizeof(struct ib_sge));
+
+       /*
+        * NOTE(review): for opcodes that are neither atomic nor RDMA this
+        * still reads rdma_wr(ibwr)->remote_addr - presumably the value is
+        * simply unused in that case; confirm.
+        */
+       wqe->iova               = (mask & WR_ATOMIC_MASK) ?
+                                       atomic_wr(ibwr)->remote_addr :
+                                       rdma_wr(ibwr)->remote_addr;
+       wqe->mask               = mask;
+       wqe->dma.length         = length;
+       wqe->dma.resid          = length;
+       wqe->dma.num_sge        = num_sge;
+       wqe->dma.cur_sge        = 0;
+       wqe->dma.sge_offset     = 0;
+       wqe->state              = wqe_state_posted;
+       /* each queued request gets the next send sequence number of the QP */
+       wqe->ssn                = atomic_add_return(1, &qp->ssn);
+
+       return 0;
+}
+
+/*
+ * Validate one send work request and append it to the send queue of @qp.
+ *
+ * Validation happens before the send queue lock is taken.  Returns
+ * -ENOMEM when the queue is full, otherwise the result of validating and
+ * building the queue entry (0 on success).
+ */
+static int post_one_send(struct rxe_qp *qp, struct ib_send_wr *ibwr,
+                        unsigned mask, u32 length)
+{
+       struct rxe_sq *sq = &qp->sq;
+       struct rxe_send_wqe *wqe;
+       unsigned long irqflags;
+       int ret;
+
+       ret = validate_send_wr(qp, ibwr, mask, length);
+       if (ret)
+               return ret;
+
+       spin_lock_irqsave(&sq->sq_lock, irqflags);
+
+       if (unlikely(queue_full(sq->queue))) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+
+       wqe = producer_addr(sq->queue);
+
+       ret = init_send_wqe(qp, ibwr, mask, length, wqe);
+       if (unlikely(ret))
+               goto unlock;
+
+       /* the entry must be fully written before the producer index moves */
+       smp_wmb();
+
+       advance_producer(sq->queue);
+
+unlock:
+       spin_unlock_irqrestore(&sq->sq_lock, irqflags);
+       return ret;
+}
+
+/*
+ * Post a chain of send work requests to a QP.
+ *
+ * The QP must be valid and its requester at least in QP_STATE_READY,
+ * otherwise the first request is reported through @bad_wr and -EINVAL is
+ * returned without queueing anything.  Requests are queued in list order;
+ * the first one that fails is reported through @bad_wr and posting stops.
+ * The requester task is run in either case.
+ */
+static int rxe_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+                        struct ib_send_wr **bad_wr)
+{
+       struct rxe_qp *qp = to_rqp(ibqp);
+       unsigned int opmask;
+       unsigned int bytes;
+       int must_sched;
+       int ret = 0;
+       int n;
+
+       if (unlikely(!qp->valid || qp->req.state < QP_STATE_READY)) {
+               *bad_wr = wr;
+               return -EINVAL;
+       }
+
+       for (; wr; wr = wr->next) {
+               /* a zero mask means the opcode is not valid for this QP */
+               opmask = wr_opcode_mask(wr->opcode, qp);
+               if (unlikely(!opmask)) {
+                       ret = -EINVAL;
+                       break;
+               }
+
+               /* inline data is only accepted for opcodes that allow it */
+               if (unlikely((wr->send_flags & IB_SEND_INLINE) &&
+                            !(opmask & WR_INLINE_MASK))) {
+                       ret = -EINVAL;
+                       break;
+               }
+
+               bytes = 0;
+               for (n = 0; n < wr->num_sge; n++)
+                       bytes += wr->sg_list[n].length;
+
+               ret = post_one_send(qp, wr, opmask, bytes);
+               if (ret)
+                       break;
+       }
+
+       /* on failure wr still points at the request that was rejected */
+       if (ret)
+               *bad_wr = wr;
+
+       /*
+        * Must sched in case of GSI QP because ib_send_mad() hold irq lock,
+        * and the requester call ip_local_out_sk() that takes spin_lock_bh.
+        */
+       must_sched = (qp_type(qp) == IB_QPT_GSI) ||
+                       (queue_count(qp->sq.queue) > 1);
+
+       rxe_run_task(&qp->req.task, must_sched);
+
+       return ret;
+}
+
+/*
+ * Post a chain of receive work requests to a QP's own receive queue.
+ *
+ * Fails with -EINVAL, reporting the first request through @bad_wr, when
+ * the QP state is below IB_QPS_INIT, the QP is not valid, or the QP is
+ * attached to an SRQ.  Otherwise requests are posted in list order under
+ * the producer lock; the first one that fails is reported through @bad_wr
+ * and its error is returned.
+ */
+static int rxe_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+                        struct ib_recv_wr **bad_wr)
+{
+       struct rxe_qp *qp = to_rqp(ibqp);
+       struct rxe_rq *rq = &qp->rq;
+       unsigned long irqflags;
+       int ret = 0;
+
+       if (unlikely((qp_state(qp) < IB_QPS_INIT) || !qp->valid || qp->srq)) {
+               *bad_wr = wr;
+               return -EINVAL;
+       }
+
+       spin_lock_irqsave(&rq->producer_lock, irqflags);
+
+       for (; wr; wr = wr->next) {
+               ret = post_one_recv(rq, wr);
+               if (unlikely(ret)) {
+                       *bad_wr = wr;
+                       break;
+               }
+       }
+
+       spin_unlock_irqrestore(&rq->producer_lock, irqflags);
+
+       return ret;
+}
+
+/*
+ * Create a completion queue.
+ *
+ * Creation flags are not supported: any non-zero attr->flags is rejected
+ * with -EINVAL.  The requested size and completion vector are validated
+ * before the CQ is taken from rxe->cq_pool.
+ *
+ * Returns the embedded ib_cq, or an ERR_PTR() on failure.
+ */
+static struct ib_cq *rxe_create_cq(struct ib_device *dev,
+                                  const struct ib_cq_init_attr *attr,
+                                  struct ib_ucontext *context,
+                                  struct ib_udata *udata)
+{
+       struct rxe_dev *rxe = to_rdev(dev);
+       struct rxe_cq *cq;
+       int ret;
+
+       if (attr->flags)
+               return ERR_PTR(-EINVAL);
+
+       ret = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector, udata);
+       if (ret)
+               return ERR_PTR(ret);
+
+       cq = rxe_alloc(&rxe->cq_pool);
+       if (!cq)
+               return ERR_PTR(-ENOMEM);
+
+       ret = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector,
+                              context, udata);
+       if (ret) {
+               rxe_drop_ref(cq);
+               return ERR_PTR(ret);
+       }
+
+       return &cq->ibcq;
+}
+
+/* Destroy a completion queue by dropping its reference; always returns 0. */
+static int rxe_destroy_cq(struct ib_cq *ibcq)
+{
+       struct rxe_cq *victim = to_rcq(ibcq);
+
+       rxe_drop_ref(victim);
+
+       return 0;
+}
+
+/*
+ * Resize a completion queue to @cqe entries.
+ *
+ * The new size is validated against the existing CQ (with completion
+ * vector 0) before the queue is resized.  Returns 0 on success or the
+ * error from either step.
+ */
+static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+       struct rxe_cq *cq = to_rcq(ibcq);
+       struct rxe_dev *rxe = to_rdev(ibcq->device);
+       int ret;
+
+       ret = rxe_cq_chk_attr(rxe, cq, cqe, 0, udata);
+       if (ret)
+               return ret;
+
+       return rxe_cq_resize_queue(cq, cqe, udata);
+}
+
+/*
+ * Remove up to @num_entries completions from the CQ, copying each one
+ * into the @wc array in order.
+ *
+ * Runs under the CQ lock.  Returns the number of completions copied, which
+ * is smaller than @num_entries when the queue runs empty first.
+ */
+static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+       struct rxe_cq *cq = to_rcq(ibcq);
+       unsigned long irqflags;
+       struct rxe_cqe *entry;
+       int polled = 0;
+
+       spin_lock_irqsave(&cq->cq_lock, irqflags);
+
+       while (polled < num_entries) {
+               entry = queue_head(cq->queue);
+               if (!entry)
+                       break;
+
+               memcpy(&wc[polled], &entry->ibwc, sizeof(*wc));
+               advance_consumer(cq->queue);
+               polled++;
+       }
+
+       spin_unlock_irqrestore(&cq->cq_lock, irqflags);
+
+       return polled;
+}
+
+/*
+ * Report how many completions are currently queued on the CQ, capped at
+ * @wc_cnt.
+ */
+static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt)
+{
+       struct rxe_cq *cq = to_rcq(ibcq);
+       int queued = queue_count(cq->queue);
+
+       if (queued > wc_cnt)
+               return wc_cnt;
+
+       return queued;
+}
+
+/*
+ * Arm the CQ for a completion notification.
+ *
+ * A CQ already armed with IB_CQ_NEXT_COMP is left as it is; otherwise the
+ * notification type is taken from the IB_CQ_SOLICITED_MASK bits of @flags.
+ * Always returns 0.
+ */
+static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+       struct rxe_cq *cq = to_rcq(ibcq);
+
+       if (cq->notify == IB_CQ_NEXT_COMP)
+               return 0;
+
+       cq->notify = flags & IB_CQ_SOLICITED_MASK;
+
+       return 0;
+}
+
+/*
+ * Create a DMA memory region on protection domain @ibpd with the given
+ * @access flags.
+ *
+ * The region is taken from rxe->mr_pool, given a pool index and a
+ * reference on the PD, and then set up by rxe_mem_init_dma().
+ *
+ * Returns the embedded ib_mr, or an ERR_PTR() on failure.
+ */
+static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access)
+{
+       struct rxe_dev *rxe = to_rdev(ibpd->device);
+       struct rxe_pd *pd = to_rpd(ibpd);
+       struct rxe_mem *mem;
+       int ret;
+
+       mem = rxe_alloc(&rxe->mr_pool);
+       if (!mem)
+               return ERR_PTR(-ENOMEM);
+
+       rxe_add_index(mem);
+       rxe_add_ref(pd);
+
+       ret = rxe_mem_init_dma(rxe, pd, access, mem);
+       if (!ret)
+               return &mem->ibmr;
+
+       /* release what was taken above, in reverse order */
+       rxe_drop_ref(pd);
+       rxe_drop_index(mem);
+       rxe_drop_ref(mem);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Register a user memory region on protection domain @ibpd.
+ *
+ * @start, @length, @iova, @access and @udata are handed unchanged to
+ * rxe_mem_init_user() once the region has been taken from rxe->mr_pool and
+ * given a pool index and a reference on the PD.
+ *
+ * Returns the embedded ib_mr, or an ERR_PTR() on failure.
+ */
+static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd,
+                                    u64 start,
+                                    u64 length,
+                                    u64 iova,
+                                    int access, struct ib_udata *udata)
+{
+       struct rxe_dev *rxe = to_rdev(ibpd->device);
+       struct rxe_pd *pd = to_rpd(ibpd);
+       struct rxe_mem *mem;
+       int ret;
+
+       mem = rxe_alloc(&rxe->mr_pool);
+       if (!mem)
+               return ERR_PTR(-ENOMEM);
+
+       rxe_add_index(mem);
+       rxe_add_ref(pd);
+
+       ret = rxe_mem_init_user(rxe, pd, start, length, iova,
+                               access, udata, mem);
+       if (!ret)
+               return &mem->ibmr;
+
+       /* release what was taken above, in reverse order */
+       rxe_drop_ref(pd);
+       rxe_drop_index(mem);
+       rxe_drop_ref(mem);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Deregister a memory region.
+ *
+ * The region is marked RXE_MEM_STATE_ZOMBIE before its PD reference, pool
+ * index and own reference are released.  Always returns 0.
+ */
+static int rxe_dereg_mr(struct ib_mr *ibmr)
+{
+       struct rxe_mem *mem = to_rmr(ibmr);
+
+       mem->state = RXE_MEM_STATE_ZOMBIE;
+
+       rxe_drop_ref(mem->pd);
+       rxe_drop_index(mem);
+       rxe_drop_ref(mem);
+
+       return 0;
+}
+
+/*
+ * Allocate a memory region for fast registration of up to @max_num_sg
+ * entries.
+ *
+ * Only IB_MR_TYPE_MEM_REG is supported; any other @mr_type is rejected
+ * with -EINVAL.
+ *
+ * Returns the embedded ib_mr, or an ERR_PTR() on failure.
+ */
+static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd,
+                                 enum ib_mr_type mr_type,
+                                 u32 max_num_sg)
+{
+       struct rxe_dev *rxe = to_rdev(ibpd->device);
+       struct rxe_pd *pd = to_rpd(ibpd);
+       struct rxe_mem *mem;
+       int ret;
+
+       if (mr_type != IB_MR_TYPE_MEM_REG)
+               return ERR_PTR(-EINVAL);
+
+       mem = rxe_alloc(&rxe->mr_pool);
+       if (!mem)
+               return ERR_PTR(-ENOMEM);
+
+       rxe_add_index(mem);
+       rxe_add_ref(pd);
+
+       ret = rxe_mem_init_fast(rxe, pd, max_num_sg, mem);
+       if (!ret)
+               return &mem->ibmr;
+
+       /* release what was taken above, in reverse order */
+       rxe_drop_ref(pd);
+       rxe_drop_index(mem);
+       rxe_drop_ref(mem);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Record one page at @addr as the next buffer of the memory region.
+ *
+ * Passed to ib_sg_to_pages() by rxe_map_mr_sg().  Buffers are stored in
+ * maps of RXE_BUF_PER_MAP entries each.  Returns -ENOMEM once all
+ * mr->num_buf slots are used, 0 otherwise.
+ */
+static int rxe_set_page(struct ib_mr *ibmr, u64 addr)
+{
+       struct rxe_mem *mr = to_rmr(ibmr);
+       struct rxe_phys_buf *slot;
+
+       if (unlikely(mr->nbuf == mr->num_buf))
+               return -ENOMEM;
+
+       /* nbuf selects first the map, then the entry inside that map */
+       slot = &mr->map[mr->nbuf / RXE_BUF_PER_MAP]->buf[mr->nbuf %
+                                                         RXE_BUF_PER_MAP];
+       slot->addr = addr;
+       slot->size = ibmr->page_size;
+
+       mr->nbuf++;
+
+       return 0;
+}
+
+/*
+ * Map a scatterlist onto a memory region.
+ *
+ * The buffer count is reset and ib_sg_to_pages() records each page through
+ * rxe_set_page().  The region's address, length and page geometry are then
+ * taken from the values found in @ibmr after that call.
+ *
+ * Returns whatever ib_sg_to_pages() returned.
+ */
+static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+                        int sg_nents, unsigned int *sg_offset)
+{
+       struct rxe_mem *mr = to_rmr(ibmr);
+       int mapped;
+
+       mr->nbuf = 0;
+
+       mapped = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_set_page);
+
+       mr->page_shift = ilog2(ibmr->page_size);
+       mr->page_mask = ibmr->page_size - 1;
+
+       mr->iova = ibmr->iova;
+       mr->va = ibmr->iova;
+       mr->length = ibmr->length;
+
+       /* offset of the region's start within its first page */
+       mr->offset = mr->iova & mr->page_mask;
+
+       return mapped;
+}
+
+/*
+ * Attach a QP to the multicast group identified by @mgid.
+ *
+ * The group reference obtained from rxe_mcast_get_grp() is dropped again
+ * once the QP has been added (or adding has failed).  @mlid is not used.
+ */
+static int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
+{
+       struct rxe_dev *rxe = to_rdev(ibqp->device);
+       struct rxe_qp *qp = to_rqp(ibqp);
+       struct rxe_mc_grp *group;
+       int ret;
+
+       /* takes a ref on the group if successful */
+       ret = rxe_mcast_get_grp(rxe, mgid, &group);
+       if (!ret) {
+               ret = rxe_mcast_add_grp_elem(rxe, qp, group);
+               rxe_drop_ref(group);
+       }
+
+       return ret;
+}
+
+/*
+ * Detach a QP from the multicast group identified by @mgid and return the
+ * result of rxe_mcast_drop_grp_elem().  @mlid is not used.
+ */
+static int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
+{
+       struct rxe_qp *qp = to_rqp(ibqp);
+
+       return rxe_mcast_drop_grp_elem(to_rdev(ibqp->device), qp, mgid);
+}
+
+/*
+ * sysfs show() handler for the "parent" attribute.
+ *
+ * Writes the name returned by the interface's parent_name() op, followed
+ * by a newline, into @buf and returns the number of bytes written.
+ */
+static ssize_t rxe_show_parent(struct device *device,
+                              struct device_attribute *attr, char *buf)
+{
+       struct rxe_dev *rxe = container_of(device, struct rxe_dev,
+                                          ib_dev.dev);
+       char *name;
+
+       name = rxe->ifc_ops->parent_name(rxe, 1);
+
+       /*
+        * scnprintf() returns the number of bytes actually stored, whereas
+        * snprintf() returns the untruncated length and so could report
+        * more than was written.  Bounding by PAGE_SIZE (the size of the
+        * buffer sysfs hands to show()) instead of 16 also keeps the
+        * trailing newline for a 15 character name.
+        */
+       return scnprintf(buf, PAGE_SIZE, "%s\n", name);
+}
+
+/* "parent" device attribute: readable via rxe_show_parent(), no store op. */
+static DEVICE_ATTR(parent, S_IRUGO, rxe_show_parent, NULL);
+
+/*
+ * Attributes created on the ib device by rxe_register_device() and removed
+ * again by rxe_unregister_device().
+ */
+static struct device_attribute *rxe_dev_attributes[] = {
+       &dev_attr_parent,
+};
+
+/*
+ * Register an rxe device with the IB core.
+ *
+ * Fills in the identity, the user verbs command mask and the verb entry
+ * points of rxe->ib_dev, registers the device and then creates the sysfs
+ * attributes listed in rxe_dev_attributes.
+ *
+ * Returns 0 on success.  If registration fails its error is returned; if
+ * creating an attribute fails the device is unregistered again and that
+ * error is returned.
+ */
+int rxe_register_device(struct rxe_dev *rxe)
+{
+       int err;
+       int i;
+       struct ib_device *dev = &rxe->ib_dev;
+
+       /*
+        * NOTE(review): "rxe%d" is a name template - presumably the IB core
+        * replaces %d with a unique index at registration; confirm.
+        */
+       strlcpy(dev->name, "rxe%d", IB_DEVICE_NAME_MAX);
+       strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc));
+
+       dev->owner = THIS_MODULE;
+       dev->node_type = RDMA_NODE_IB_CA;
+       dev->phys_port_cnt = 1;
+       dev->num_comp_vectors = RXE_NUM_COMP_VECTORS;
+       /* DMA device and node GUID are supplied by the interface ops */
+       dev->dma_device = rxe->ifc_ops->dma_device(rxe);
+       dev->local_dma_lkey = 0;
+       dev->node_guid = rxe->ifc_ops->node_guid(rxe);
+       dev->dma_ops = &rxe_dma_mapping_ops;
+
+       /* user verbs commands this device declares */
+       dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION;
+       dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT)
+           | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)
+           | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE)
+           | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT)
+           | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD)
+           | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD)
+           | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_POST_SRQ_RECV)
+           | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP)
+           | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP)
+           | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP)
+           | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP)
+           | BIT_ULL(IB_USER_VERBS_CMD_POST_SEND)
+           | BIT_ULL(IB_USER_VERBS_CMD_POST_RECV)
+           | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_PEEK_CQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)
+           | BIT_ULL(IB_USER_VERBS_CMD_REG_MR)
+           | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR)
+           | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH)
+           | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_AH)
+           | BIT_ULL(IB_USER_VERBS_CMD_QUERY_AH)
+           | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH)
+           | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST)
+           | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST)
+           ;
+
+       /* verb entry points */
+       dev->query_device = rxe_query_device;
+       dev->modify_device = rxe_modify_device;
+       dev->query_port = rxe_query_port;
+       dev->modify_port = rxe_modify_port;
+       dev->get_link_layer = rxe_get_link_layer;
+       dev->query_gid = rxe_query_gid;
+       dev->get_netdev = rxe_get_netdev;
+       dev->add_gid = rxe_add_gid;
+       dev->del_gid = rxe_del_gid;
+       dev->query_pkey = rxe_query_pkey;
+       dev->alloc_ucontext = rxe_alloc_ucontext;
+       dev->dealloc_ucontext = rxe_dealloc_ucontext;
+       dev->mmap = rxe_mmap;
+       dev->get_port_immutable = rxe_port_immutable;
+       dev->alloc_pd = rxe_alloc_pd;
+       dev->dealloc_pd = rxe_dealloc_pd;
+       dev->create_ah = rxe_create_ah;
+       dev->modify_ah = rxe_modify_ah;
+       dev->query_ah = rxe_query_ah;
+       dev->destroy_ah = rxe_destroy_ah;
+       dev->create_srq = rxe_create_srq;
+       dev->modify_srq = rxe_modify_srq;
+       dev->query_srq = rxe_query_srq;
+       dev->destroy_srq = rxe_destroy_srq;
+       dev->post_srq_recv = rxe_post_srq_recv;
+       dev->create_qp = rxe_create_qp;
+       dev->modify_qp = rxe_modify_qp;
+       dev->query_qp = rxe_query_qp;
+       dev->destroy_qp = rxe_destroy_qp;
+       dev->post_send = rxe_post_send;
+       dev->post_recv = rxe_post_recv;
+       dev->create_cq = rxe_create_cq;
+       dev->destroy_cq = rxe_destroy_cq;
+       dev->resize_cq = rxe_resize_cq;
+       dev->poll_cq = rxe_poll_cq;
+       dev->peek_cq = rxe_peek_cq;
+       dev->req_notify_cq = rxe_req_notify_cq;
+       dev->get_dma_mr = rxe_get_dma_mr;
+       dev->reg_user_mr = rxe_reg_user_mr;
+       dev->dereg_mr = rxe_dereg_mr;
+       dev->alloc_mr = rxe_alloc_mr;
+       dev->map_mr_sg = rxe_map_mr_sg;
+       dev->attach_mcast = rxe_attach_mcast;
+       dev->detach_mcast = rxe_detach_mcast;
+
+       err = ib_register_device(dev, NULL);
+       if (err) {
+               pr_warn("rxe_register_device failed, err = %d\n", err);
+               goto err1;
+       }
+
+       /* expose the driver's sysfs attributes on the registered device */
+       for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) {
+               err = device_create_file(&dev->dev, rxe_dev_attributes[i]);
+               if (err) {
+                       pr_warn("device_create_file failed, i = %d, err = %d\n",
+                               i, err);
+                       goto err2;
+               }
+       }
+
+       return 0;
+
+err2:
+       /*
+        * NOTE(review): attributes created before the failing one are not
+        * removed explicitly here - presumably unregistering the device
+        * takes them down with it; confirm.
+        */
+       ib_unregister_device(dev);
+err1:
+       return err;
+}
+
+/*
+ * Undo rxe_register_device(): remove the sysfs attributes listed in
+ * rxe_dev_attributes, then unregister the device from the IB core.
+ * Always returns 0.
+ */
+int rxe_unregister_device(struct rxe_dev *rxe)
+{
+       struct ib_device *ibdev = &rxe->ib_dev;
+       int idx = 0;
+
+       while (idx < ARRAY_SIZE(rxe_dev_attributes)) {
+               device_remove_file(&ibdev->dev, rxe_dev_attributes[idx]);
+               ++idx;
+       }
+
+       ib_unregister_device(ibdev);
+
+       return 0;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h

new file mode 100644 (file)

index 0000000..cac1d52
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *        Redistribution and use in source and binary forms, with or
+ *        without modification, are permitted provided that the following
+ *        conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_VERBS_H
+#define RXE_VERBS_H
+
+#include <linux/interrupt.h>
+#include <rdma/rdma_user_rxe.h>
+#include "rxe_pool.h"
+#include "rxe_task.h"
+
+static inline int pkey_match(u16 key1, u16 key2)
+{
+       return (((key1 & 0x7fff) != 0) &&
+               ((key1 & 0x7fff) == (key2 & 0x7fff)) &&
+               ((key1 & 0x8000) || (key2 & 0x8000))) ? 1 : 0;
+}
+
+/* Return >0 if psn_a > psn_b
+ *        0 if psn_a == psn_b
+ *       <0 if psn_a < psn_b
+ */
+static inline int psn_compare(u32 psn_a, u32 psn_b)
+{
+       s32 diff;
+
+       diff = (psn_a - psn_b) << 8;
+       return diff;
+}
+
+struct rxe_ucontext {
+       struct rxe_pool_entry   pelem;
+       struct ib_ucontext      ibuc;
+};
+
+struct rxe_pd {
+       struct rxe_pool_entry   pelem;
+       struct ib_pd            ibpd;
+};
+
+struct rxe_ah {
+       struct rxe_pool_entry   pelem;
+       struct ib_ah            ibah;
+       struct rxe_pd           *pd;
+       struct rxe_av           av;
+};
+
+struct rxe_cqe {
+       union {
+               struct ib_wc            ibwc;
+               struct ib_uverbs_wc     uibwc;
+       };
+};
+
+struct rxe_cq {
+       struct rxe_pool_entry   pelem;
+       struct ib_cq            ibcq;
+       struct rxe_queue        *queue;
+       spinlock_t              cq_lock;
+       u8                      notify;
+       int                     is_user;
+       struct tasklet_struct   comp_task;
+};
+
+enum wqe_state {
+       wqe_state_posted,
+       wqe_state_processing,
+       wqe_state_pending,
+       wqe_state_done,
+       wqe_state_error,
+};
+
+struct rxe_sq {
+       int                     max_wr;
+       int                     max_sge;
+       int                     max_inline;
+       spinlock_t              sq_lock; /* guard queue */
+       struct rxe_queue        *queue;
+};
+
+struct rxe_rq {
+       int                     max_wr;
+       int                     max_sge;
+       spinlock_t              producer_lock; /* guard queue producer */
+       spinlock_t              consumer_lock; /* guard queue consumer */
+       struct rxe_queue        *queue;
+};
+
+struct rxe_srq {
+       struct rxe_pool_entry   pelem;
+       struct ib_srq           ibsrq;
+       struct rxe_pd           *pd;
+       struct rxe_rq           rq;
+       u32                     srq_num;
+
+       int                     limit;
+       int                     error;
+};
+
+enum rxe_qp_state {
+       QP_STATE_RESET,
+       QP_STATE_INIT,
+       QP_STATE_READY,
+       QP_STATE_DRAIN,         /* req only */
+       QP_STATE_DRAINED,       /* req only */
+       QP_STATE_ERROR
+};
+
+extern char *rxe_qp_state_name[];
+
+struct rxe_req_info {
+       enum rxe_qp_state       state;
+       int                     wqe_index;
+       u32                     psn;
+       int                     opcode;
+       atomic_t                rd_atomic;
+       int                     wait_fence;
+       int                     need_rd_atomic;
+       int                     wait_psn;
+       int                     need_retry;
+       int                     noack_pkts;
+       struct rxe_task         task;
+};
+
+struct rxe_comp_info {
+       u32                     psn;
+       int                     opcode;
+       int                     timeout;
+       int                     timeout_retry;
+       u32                     retry_cnt;
+       u32                     rnr_retry;
+       struct rxe_task         task;
+};
+
+enum rdatm_res_state {
+       rdatm_res_state_next,
+       rdatm_res_state_new,
+       rdatm_res_state_replay,
+};
+
+struct resp_res {
+       int                     type;
+       u32                     first_psn;
+       u32                     last_psn;
+       u32                     cur_psn;
+       enum rdatm_res_state    state;
+
+       union {
+               struct {
+                       struct sk_buff  *skb;
+               } atomic;
+               struct {
+                       struct rxe_mem  *mr;
+                       u64             va_org;
+                       u32             rkey;
+                       u32             length;
+                       u64             va;
+                       u32             resid;
+               } read;
+       };
+};
+
+struct rxe_resp_info {
+       enum rxe_qp_state       state;
+       u32                     msn;
+       u32                     psn;
+       int                     opcode;
+       int                     drop_msg;
+       int                     goto_error;
+       int                     sent_psn_nak;
+       enum ib_wc_status       status;
+       u8                      aeth_syndrome;
+
+       /* Receive only */
+       struct rxe_recv_wqe     *wqe;
+
+       /* RDMA read / atomic only */
+       u64                     va;
+       struct rxe_mem          *mr;
+       u32                     resid;
+       u32                     rkey;
+       u64                     atomic_orig;
+
+       /* SRQ only */
+       struct {
+               struct rxe_recv_wqe     wqe;
+               struct ib_sge           sge[RXE_MAX_SGE];
+       } srq_wqe;
+
+       /* Responder resources. It's a circular list where the oldest
+        * resource is dropped first.
+        */
+       struct resp_res         *resources;
+       unsigned int            res_head;
+       unsigned int            res_tail;
+       struct resp_res         *res;
+       struct rxe_task         task;
+};
+
+struct rxe_qp {
+       struct rxe_pool_entry   pelem;
+       struct ib_qp            ibqp;
+       struct ib_qp_attr       attr;
+       unsigned int            valid;
+       unsigned int            mtu;
+       int                     is_user;
+
+       struct rxe_pd           *pd;
+       struct rxe_srq          *srq;
+       struct rxe_cq           *scq;
+       struct rxe_cq           *rcq;
+
+       enum ib_sig_type        sq_sig_type;
+
+       struct rxe_sq           sq;
+       struct rxe_rq           rq;
+
+       struct socket           *sk;
+
+       struct rxe_av           pri_av;
+       struct rxe_av           alt_av;
+
+       /* list of mcast groups qp has joined (for cleanup) */
+       struct list_head        grp_list;
+       spinlock_t              grp_lock; /* guard grp_list */
+
+       struct sk_buff_head     req_pkts;
+       struct sk_buff_head     resp_pkts;
+       struct sk_buff_head     send_pkts;
+
+       struct rxe_req_info     req;
+       struct rxe_comp_info    comp;
+       struct rxe_resp_info    resp;
+
+       atomic_t                ssn;
+       atomic_t                skb_out;
+       int                     need_req_skb;
+
+       /* Timer for retransmitting packet when ACKs have been lost. RC
+        * only. The requester sets it when it is not already
+        * started. The responder resets it whenever an ack is
+        * received.
+        */
+       struct timer_list retrans_timer;
+       u64 qp_timeout_jiffies;
+
+       /* Timer for handling RNR NAKS. */
+       struct timer_list rnr_nak_timer;
+
+       spinlock_t              state_lock; /* guard requester and completer */
+};
+
+enum rxe_mem_state {
+       RXE_MEM_STATE_ZOMBIE,
+       RXE_MEM_STATE_INVALID,
+       RXE_MEM_STATE_FREE,
+       RXE_MEM_STATE_VALID,
+};
+
+enum rxe_mem_type {
+       RXE_MEM_TYPE_NONE,
+       RXE_MEM_TYPE_DMA,
+       RXE_MEM_TYPE_MR,
+       RXE_MEM_TYPE_FMR,
+       RXE_MEM_TYPE_MW,
+};
+
+#define RXE_BUF_PER_MAP                (PAGE_SIZE / sizeof(struct rxe_phys_buf))
+
+struct rxe_phys_buf {
+       u64      addr;
+       u64      size;
+};
+
+struct rxe_map {
+       struct rxe_phys_buf     buf[RXE_BUF_PER_MAP];
+};
+
+struct rxe_mem {
+       struct rxe_pool_entry   pelem;
+       union {
+               struct ib_mr            ibmr;
+               struct ib_mw            ibmw;
+       };
+
+       struct rxe_pd           *pd;
+       struct ib_umem          *umem;
+
+       u32                     lkey;
+       u32                     rkey;
+
+       enum rxe_mem_state      state;
+       enum rxe_mem_type       type;
+       u64                     va;
+       u64                     iova;
+       size_t                  length;
+       u32                     offset;
+       int                     access;
+
+       int                     page_shift;
+       int                     page_mask;
+       int                     map_shift;
+       int                     map_mask;
+
+       u32                     num_buf;
+       u32                     nbuf;
+
+       u32                     max_buf;
+       u32                     num_map;
+
+       struct rxe_map          **map;
+};
+
+struct rxe_mc_grp {
+       struct rxe_pool_entry   pelem;
+       spinlock_t              mcg_lock; /* guard group */
+       struct rxe_dev          *rxe;
+       struct list_head        qp_list;
+       union ib_gid            mgid;
+       int                     num_qp;
+       u32                     qkey;
+       u16                     pkey;
+};
+
+struct rxe_mc_elem {
+       struct rxe_pool_entry   pelem;
+       struct list_head        qp_list;
+       struct list_head        grp_list;
+       struct rxe_qp           *qp;
+       struct rxe_mc_grp       *grp;
+};
+
+struct rxe_port {
+       struct ib_port_attr     attr;
+       u16                     *pkey_tbl;
+       __be64                  port_guid;
+       __be64                  subnet_prefix;
+       spinlock_t              port_lock; /* guard port */
+       unsigned int            mtu_cap;
+       /* special QPs */
+       u32                     qp_smi_index;
+       u32                     qp_gsi_index;
+};
+
+/* callbacks from rdma_rxe to network interface layer */
+struct rxe_ifc_ops {
+       void (*release)(struct rxe_dev *rxe);
+       __be64 (*node_guid)(struct rxe_dev *rxe);
+       __be64 (*port_guid)(struct rxe_dev *rxe);
+       struct device *(*dma_device)(struct rxe_dev *rxe);
+       int (*mcast_add)(struct rxe_dev *rxe, union ib_gid *mgid);
+       int (*mcast_delete)(struct rxe_dev *rxe, union ib_gid *mgid);
+       int (*prepare)(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+                      struct sk_buff *skb, u32 *crc);
+       int (*send)(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+                   struct sk_buff *skb);
+       int (*loopback)(struct sk_buff *skb);
+       struct sk_buff *(*init_packet)(struct rxe_dev *rxe, struct rxe_av *av,
+                                      int paylen, struct rxe_pkt_info *pkt);
+       char *(*parent_name)(struct rxe_dev *rxe, unsigned int port_num);
+       enum rdma_link_layer (*link_layer)(struct rxe_dev *rxe,
+                                          unsigned int port_num);
+};
+
+struct rxe_dev {
+       struct ib_device        ib_dev;
+       struct ib_device_attr   attr;
+       int                     max_ucontext;
+       int                     max_inline_data;
+       struct kref             ref_cnt;
+       struct mutex    usdev_lock;
+
+       struct rxe_ifc_ops      *ifc_ops;
+
+       struct net_device       *ndev;
+
+       int                     xmit_errors;
+
+       struct rxe_pool         uc_pool;
+       struct rxe_pool         pd_pool;
+       struct rxe_pool         ah_pool;
+       struct rxe_pool         srq_pool;
+       struct rxe_pool         qp_pool;
+       struct rxe_pool         cq_pool;
+       struct rxe_pool         mr_pool;
+       struct rxe_pool         mw_pool;
+       struct rxe_pool         mc_grp_pool;
+       struct rxe_pool         mc_elem_pool;
+
+       spinlock_t              pending_lock; /* guard pending_mmaps */
+       struct list_head        pending_mmaps;
+
+       spinlock_t              mmap_offset_lock; /* guard mmap_offset */
+       int                     mmap_offset;
+
+       struct rxe_port         port;
+       struct list_head        list;
+};
+
+static inline struct rxe_dev *to_rdev(struct ib_device *dev)
+{
+       return dev ? container_of(dev, struct rxe_dev, ib_dev) : NULL;
+}
+
+static inline struct rxe_ucontext *to_ruc(struct ib_ucontext *uc)
+{
+       return uc ? container_of(uc, struct rxe_ucontext, ibuc) : NULL;
+}
+
+static inline struct rxe_pd *to_rpd(struct ib_pd *pd)
+{
+       return pd ? container_of(pd, struct rxe_pd, ibpd) : NULL;
+}
+
+static inline struct rxe_ah *to_rah(struct ib_ah *ah)
+{
+       return ah ? container_of(ah, struct rxe_ah, ibah) : NULL;
+}
+
+static inline struct rxe_srq *to_rsrq(struct ib_srq *srq)
+{
+       return srq ? container_of(srq, struct rxe_srq, ibsrq) : NULL;
+}
+
+static inline struct rxe_qp *to_rqp(struct ib_qp *qp)
+{
+       return qp ? container_of(qp, struct rxe_qp, ibqp) : NULL;
+}
+
+static inline struct rxe_cq *to_rcq(struct ib_cq *cq)
+{
+       return cq ? container_of(cq, struct rxe_cq, ibcq) : NULL;
+}
+
+static inline struct rxe_mem *to_rmr(struct ib_mr *mr)
+{
+       return mr ? container_of(mr, struct rxe_mem, ibmr) : NULL;
+}
+
+static inline struct rxe_mem *to_rmw(struct ib_mw *mw)
+{
+       return mw ? container_of(mw, struct rxe_mem, ibmw) : NULL;
+}
+
+int rxe_register_device(struct rxe_dev *rxe);
+int rxe_unregister_device(struct rxe_dev *rxe);
+
+void rxe_mc_cleanup(void *arg);
+
+#endif /* RXE_VERBS_H */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c

index 1502199c8e56272584e7fb5797031f369a67fcd2..7b6d40ff1acf8e3a80269a713705e20655def9da 100644 (file)
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -62,10 +62,8 @@ static void ipoib_get_drvinfo(struct net_device *netdev,
  {
         struct ipoib_dev_priv *priv = netdev_priv(netdev);
  
-       snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
-                "%d.%d.%d", (int)(priv->ca->attrs.fw_ver >> 32),
-                (int)(priv->ca->attrs.fw_ver >> 16) & 0xffff,
-                (int)priv->ca->attrs.fw_ver & 0xffff);
+       ib_get_device_fw_str(priv->ca, drvinfo->fw_version,
+                            sizeof(drvinfo->fw_version));
  
         strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device),
                 sizeof(drvinfo->bus_info));
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c

index 5f58c41ef787d22692920855f397c1bbcb079e6e..74bcaa0642261e672b6747e731d9c9ca56befd07 100644 (file)
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1967,8 +1967,7 @@ int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
         priv->hca_caps = hca->attrs.device_cap_flags;
  
         if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
-               priv->dev->hw_features = NETIF_F_SG |
-                       NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
+               priv->dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
  
                 if (priv->hca_caps & IB_DEVICE_UD_TSO)
                         priv->dev->hw_features |= NETIF_F_TSO;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c

index 1e7cbbaa15bd0c0369c9595517bd5471f3fa4612..c55ecb2c3736cedfe6ffb5bff4d2c44f44c167fb 100644 (file)
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -135,7 +135,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
                 .cap = {
                         .max_send_wr  = ipoib_sendq_size,
                         .max_recv_wr  = ipoib_recvq_size,
-                       .max_send_sge = 1,
+                       .max_send_sge = min_t(u32, priv->ca->attrs.max_sge,
+                                             MAX_SKB_FRAGS + 1),
                         .max_recv_sge = IPOIB_UD_RX_SG
                 },
                 .sq_sig_type = IB_SIGNAL_ALL_WR,
@@ -205,10 +206,6 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
         if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING)
                 init_attr.create_flags |= IB_QP_CREATE_NETIF_QP;
  
-       if (dev->features & NETIF_F_SG)
-               init_attr.cap.max_send_sge =
-                       min_t(u32, priv->ca->attrs.max_sge, MAX_SKB_FRAGS + 1);
-
         priv->qp = ib_create_qp(priv->pd, &init_attr);
         if (IS_ERR(priv->qp)) {
                 printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
@@ -234,6 +231,9 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
         priv->rx_wr.next = NULL;
         priv->rx_wr.sg_list = priv->rx_sge;
  
+       if (init_attr.cap.max_send_sge > 1)
+               dev->features |= NETIF_F_SG;
+
         priv->max_send_sge = init_attr.cap.max_send_sge;
  
         return 0;
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c

index a990c04208c94e1d79a28442381c5c6fbb522077..ba6be060a476b19d608c7797bed4526b7df04332 100644 (file)
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -137,8 +137,6 @@ isert_create_qp(struct isert_conn *isert_conn,
         attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1;
         attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX;
         attr.cap.max_send_sge = device->ib_device->attrs.max_sge;
-       isert_conn->max_sge = min(device->ib_device->attrs.max_sge,
-                                 device->ib_device->attrs.max_sge_rd);
         attr.cap.max_recv_sge = 1;
         attr.sq_sig_type = IB_SIGNAL_REQ_WR;
         attr.qp_type = IB_QPT_RC;
diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h

index e512ba941f2f980d659b3e75ec31fb708964ff01..fc791efe3a108178f1949f386548ed84ae81ebc8 100644 (file)
--- a/drivers/infiniband/ulp/isert/ib_isert.h
+++ b/drivers/infiniband/ulp/isert/ib_isert.h
@@ -138,7 +138,6 @@ struct isert_conn {
         u32                     responder_resources;
         u32                     initiator_depth;
         bool                    pi_support;
-       u32                     max_sge;
         struct iser_rx_desc     *login_req_buf;
         char                    *login_rsp_buf;
         u64                     login_req_dma;
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c

index 4a4155640d516252fba99c5728bb794e005218cf..dfa23b075a88469b73c3f199f73c8b8ed571187f 100644 (file)
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -1601,6 +1601,7 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
         struct ib_qp_init_attr *qp_init;
         struct srpt_port *sport = ch->sport;
         struct srpt_device *sdev = sport->sdev;
+       const struct ib_device_attr *attrs = &sdev->device->attrs;
         u32 srp_sq_size = sport->port_attrib.srp_sq_size;
         int ret;
  
@@ -1638,7 +1639,7 @@ retry:
          */
         qp_init->cap.max_send_wr = srp_sq_size / 2;
         qp_init->cap.max_rdma_ctxs = srp_sq_size / 2;
-       qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE;
+       qp_init->cap.max_send_sge = min(attrs->max_sge, SRPT_MAX_SG_PER_WQE);
         qp_init->port_num = ch->sport->port;
  
         ch->qp = ib_create_qp(sdev->pd, qp_init);
@@ -2261,7 +2262,7 @@ static void srpt_queue_response(struct se_cmd *cmd)
                 container_of(cmd, struct srpt_send_ioctx, cmd);
         struct srpt_rdma_ch *ch = ioctx->ch;
         struct srpt_device *sdev = ch->sport->sdev;
-       struct ib_send_wr send_wr, *first_wr = NULL, *bad_wr;
+       struct ib_send_wr send_wr, *first_wr = &send_wr, *bad_wr;
         struct ib_sge sge;
         enum srpt_command_state state;
         unsigned long flags;
@@ -2302,11 +2303,8 @@ static void srpt_queue_response(struct se_cmd *cmd)
                         struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
  
                         first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp,
-                                       ch->sport->port, NULL,
-                                       first_wr ? first_wr : &send_wr);
+                                       ch->sport->port, NULL, first_wr);
                 }
-       } else {
-               first_wr = &send_wr;
         }
  
         if (state != SRPT_STATE_MGMT)
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h

index 389030487da7eecb521342e8a83859b39b3523a6..581878782854322301dc85ba1e6c763888da0827 100644 (file)
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -106,7 +106,11 @@ enum {
         SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2,
  
         SRPT_DEF_SG_TABLESIZE = 128,
-       SRPT_DEF_SG_PER_WQE = 16,
+       /*
+        * An experimentally determined value that keeps QP creation from
+        * failing with "swiotlb buffer is full" on systems using the swiotlb.
+        */
+       SRPT_MAX_SG_PER_WQE = 16,
  
         MIN_SRPT_SQ_SIZE = 16,
         DEF_SRPT_SQ_SIZE = 4096,
diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c

index a529a4535457217e761fd1e8f8b0748cb48a5498..83af17ad0f1f131d6d397e3df2e15521d0c24b55 100644 (file)
--- a/drivers/input/joystick/xpad.c
+++ b/drivers/input/joystick/xpad.c
@@ -115,6 +115,10 @@ static bool sticks_to_null;
  module_param(sticks_to_null, bool, S_IRUGO);
  MODULE_PARM_DESC(sticks_to_null, "Do not map sticks at all for unknown pads");
  
+static bool auto_poweroff = true;
+module_param(auto_poweroff, bool, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(auto_poweroff, "Power off wireless controllers on suspend");
+
  static const struct xpad_device {
         u16 idVendor;
         u16 idProduct;
@@ -1248,6 +1252,36 @@ static void xpad_stop_input(struct usb_xpad *xpad)
         usb_kill_urb(xpad->irq_in);
  }
  
+static void xpad360w_poweroff_controller(struct usb_xpad *xpad)
+{
+       unsigned long flags;
+       struct xpad_output_packet *packet =
+                       &xpad->out_packets[XPAD_OUT_CMD_IDX];
+
+       spin_lock_irqsave(&xpad->odata_lock, flags);
+
+       packet->data[0] = 0x00;
+       packet->data[1] = 0x00;
+       packet->data[2] = 0x08;
+       packet->data[3] = 0xC0;
+       packet->data[4] = 0x00;
+       packet->data[5] = 0x00;
+       packet->data[6] = 0x00;
+       packet->data[7] = 0x00;
+       packet->data[8] = 0x00;
+       packet->data[9] = 0x00;
+       packet->data[10] = 0x00;
+       packet->data[11] = 0x00;
+       packet->len = 12;
+       packet->pending = true;
+
+       /* Reset the sequence so we send out poweroff now */
+       xpad->last_out_packet = -1;
+       xpad_try_sending_next_out_packet(xpad);
+
+       spin_unlock_irqrestore(&xpad->odata_lock, flags);
+}
+
  static int xpad360w_start_input(struct usb_xpad *xpad)
  {
         int error;
@@ -1590,6 +1624,15 @@ static int xpad_suspend(struct usb_interface *intf, pm_message_t message)
                  * or goes away.
                  */
                 xpad360w_stop_input(xpad);
+
+               /*
+                * The wireless adapter is going off now, so the
+                * gamepads are going to become disconnected.
+                * Unless explicitly disabled, power them down
+                * so they don't just sit there flashing.
+                */
+               if (auto_poweroff && xpad->pad_present)
+                       xpad360w_poweroff_controller(xpad);
         } else {
                 mutex_lock(&input->mutex);
                 if (input->users)
diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c

index b01966dc7eb3db697c6313b4963e79912990b98f..4b0878f3547171044600bd85d9a0c24c66accd3a 100644 (file)
--- a/drivers/input/keyboard/cros_ec_keyb.c
+++ b/drivers/input/keyboard/cros_ec_keyb.c
@@ -186,7 +186,7 @@ static irqreturn_t cros_ec_keyb_irq(int irq, void *data)
         if (ret >= 0)
                 cros_ec_keyb_process(ckdev, kb_state, ret);
         else
-               dev_err(ec->dev, "failed to get keyboard state: %d\n", ret);
+               dev_err(ckdev->dev, "failed to get keyboard state: %d\n", ret);
  
         return IRQ_HANDLED;
  }
@@ -236,7 +236,7 @@ static void cros_ec_keyb_compute_valid_keys(struct cros_ec_keyb *ckdev)
  static int cros_ec_keyb_probe(struct platform_device *pdev)
  {
         struct cros_ec_device *ec = dev_get_drvdata(pdev->dev.parent);
-       struct device *dev = ec->dev;
+       struct device *dev = &pdev->dev;
         struct cros_ec_keyb *ckdev;
         struct input_dev *idev;
         struct device_node *np;
@@ -246,23 +246,22 @@ static int cros_ec_keyb_probe(struct platform_device *pdev)
         if (!np)
                 return -ENODEV;
  
-       ckdev = devm_kzalloc(&pdev->dev, sizeof(*ckdev), GFP_KERNEL);
+       ckdev = devm_kzalloc(dev, sizeof(*ckdev), GFP_KERNEL);
         if (!ckdev)
                 return -ENOMEM;
-       err = matrix_keypad_parse_of_params(&pdev->dev, &ckdev->rows,
-                                           &ckdev->cols);
+       err = matrix_keypad_parse_of_params(dev, &ckdev->rows, &ckdev->cols);
         if (err)
                 return err;
  
-       ckdev->valid_keys = devm_kzalloc(&pdev->dev, ckdev->cols, GFP_KERNEL);
+       ckdev->valid_keys = devm_kzalloc(dev, ckdev->cols, GFP_KERNEL);
         if (!ckdev->valid_keys)
                 return -ENOMEM;
  
-       ckdev->old_kb_state = devm_kzalloc(&pdev->dev, ckdev->cols, GFP_KERNEL);
+       ckdev->old_kb_state = devm_kzalloc(dev, ckdev->cols, GFP_KERNEL);
         if (!ckdev->old_kb_state)
                 return -ENOMEM;
  
-       idev = devm_input_allocate_device(&pdev->dev);
+       idev = devm_input_allocate_device(dev);
         if (!idev)
                 return -ENOMEM;
  
@@ -273,7 +272,7 @@ static int cros_ec_keyb_probe(struct platform_device *pdev)
  
         ckdev->ec = ec;
         ckdev->dev = dev;
-       dev_set_drvdata(&pdev->dev, ckdev);
+       dev_set_drvdata(dev, ckdev);
  
         idev->name = CROS_EC_DEV_NAME;
         idev->phys = ec->phys_name;
@@ -282,7 +281,7 @@ static int cros_ec_keyb_probe(struct platform_device *pdev)
         idev->id.bustype = BUS_VIRTUAL;
         idev->id.version = 1;
         idev->id.product = 0;
-       idev->dev.parent = &pdev->dev;
+       idev->dev.parent = dev;
         idev->open = cros_ec_keyb_open;
         idev->close = cros_ec_keyb_close;
  
diff --git a/drivers/input/misc/rotary_encoder.c b/drivers/input/misc/rotary_encoder.c

index c7fc8d4fb08034fec95eb3e6e6f98f960d5262e6..1588aecafff79d8fdb54d80b7fa03a2ee9036f00 100644 (file)
--- a/drivers/input/misc/rotary_encoder.c
+++ b/drivers/input/misc/rotary_encoder.c
@@ -28,6 +28,11 @@
  
  #define DRV_NAME "rotary-encoder"
  
+enum rotary_encoder_encoding {
+       ROTENC_GRAY,
+       ROTENC_BINARY,
+};
+
  struct rotary_encoder {
         struct input_dev *input;
  
@@ -37,6 +42,7 @@ struct rotary_encoder {
         u32 axis;
         bool relative_axis;
         bool rollover;
+       enum rotary_encoder_encoding encoding;
  
         unsigned int pos;
  
@@ -57,8 +63,9 @@ static unsigned int rotary_encoder_get_state(struct rotary_encoder *encoder)
  
         for (i = 0; i < encoder->gpios->ndescs; ++i) {
                 int val = gpiod_get_value_cansleep(encoder->gpios->desc[i]);
+
                 /* convert from gray encoding to normal */
-               if (ret & 1)
+               if (encoder->encoding == ROTENC_GRAY && ret & 1)
                         val = !val;
  
                 ret = ret << 1 | val;
@@ -213,6 +220,20 @@ static int rotary_encoder_probe(struct platform_device *pdev)
         encoder->rollover =
                 device_property_read_bool(dev, "rotary-encoder,rollover");
  
+       if (!device_property_present(dev, "rotary-encoder,encoding") ||
+           !device_property_match_string(dev, "rotary-encoder,encoding",
+                                         "gray")) {
+               dev_info(dev, "gray");
+               encoder->encoding = ROTENC_GRAY;
+       } else if (!device_property_match_string(dev, "rotary-encoder,encoding",
+                                                "binary")) {
+               dev_info(dev, "binary");
+               encoder->encoding = ROTENC_BINARY;
+       } else {
+               dev_err(dev, "unknown encoding setting\n");
+               return -EINVAL;
+       }
+
         device_property_read_u32(dev, "linux,axis", &encoder->axis);
         encoder->relative_axis =
                 device_property_read_bool(dev, "rotary-encoder,relative-axis");
diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c

index 2f589857a0395d8c48f685f47bd1c9ceb8445213..d15b338130213c53489baa600d6dcea1500790d4 100644 (file)
--- a/drivers/input/mouse/elan_i2c_core.c
+++ b/drivers/input/mouse/elan_i2c_core.c
@@ -4,7 +4,8 @@
   * Copyright (c) 2013 ELAN Microelectronics Corp.
   *
   * Author: 林政維 (Duson Lin) <dusonlin@emc.com.tw>
- * Version: 1.6.0
+ * Author: KT Liao <kt.liao@emc.com.tw>
+ * Version: 1.6.2
   *
   * Based on cyapa driver:
   * copyright (c) 2011-2012 Cypress Semiconductor, Inc.
@@ -40,7 +41,7 @@
  #include "elan_i2c.h"
  
  #define DRIVER_NAME            "elan_i2c"
-#define ELAN_DRIVER_VERSION    "1.6.1"
+#define ELAN_DRIVER_VERSION    "1.6.2"
  #define ELAN_VENDOR_ID         0x04f3
  #define ETP_MAX_PRESSURE       255
  #define ETP_FWIDTH_REDUCE      90
@@ -199,9 +200,41 @@ static int elan_sleep(struct elan_tp_data *data)
         return error;
  }
  
+static int elan_query_product(struct elan_tp_data *data)
+{
+       int error;
+
+       error = data->ops->get_product_id(data->client, &data->product_id);
+       if (error)
+               return error;
+
+       error = data->ops->get_sm_version(data->client, &data->ic_type,
+                                         &data->sm_version);
+       if (error)
+               return error;
+
+       return 0;
+}
+
+static int elan_check_ASUS_special_fw(struct elan_tp_data *data)
+{
+       if (data->ic_type != 0x0E)
+               return false;
+
+       switch (data->product_id) {
+       case 0x05 ... 0x07:
+       case 0x09:
+       case 0x13:
+               return true;
+       default:
+               return false;
+       }
+}
+
  static int __elan_initialize(struct elan_tp_data *data)
  {
         struct i2c_client *client = data->client;
+       bool woken_up = false;
         int error;
  
         error = data->ops->initialize(client);
@@ -210,6 +243,27 @@ static int __elan_initialize(struct elan_tp_data *data)
                 return error;
         }
  
+       error = elan_query_product(data);
+       if (error)
+               return error;
+
+       /*
+        * Some ASUS devices were shipped with firmware that requires
+        * touchpads to be woken up first, before attempting to switch
+        * them into absolute reporting mode.
+        */
+       if (elan_check_ASUS_special_fw(data)) {
+               error = data->ops->sleep_control(client, false);
+               if (error) {
+                       dev_err(&client->dev,
+                               "failed to wake device up: %d\n", error);
+                       return error;
+               }
+
+               msleep(200);
+               woken_up = true;
+       }
+
         data->mode |= ETP_ENABLE_ABS;
         error = data->ops->set_mode(client, data->mode);
         if (error) {
@@ -218,11 +272,13 @@ static int __elan_initialize(struct elan_tp_data *data)
                 return error;
         }
  
-       error = data->ops->sleep_control(client, false);
-       if (error) {
-               dev_err(&client->dev,
-                       "failed to wake device up: %d\n", error);
-               return error;
+       if (!woken_up) {
+               error = data->ops->sleep_control(client, false);
+               if (error) {
+                       dev_err(&client->dev,
+                               "failed to wake device up: %d\n", error);
+                       return error;
+               }
         }
  
         return 0;
@@ -248,10 +304,6 @@ static int elan_query_device_info(struct elan_tp_data *data)
  {
         int error;
  
-       error = data->ops->get_product_id(data->client, &data->product_id);
-       if (error)
-               return error;
-
         error = data->ops->get_version(data->client, false, &data->fw_version);
         if (error)
                 return error;
@@ -261,11 +313,6 @@ static int elan_query_device_info(struct elan_tp_data *data)
         if (error)
                 return error;
  
-       error = data->ops->get_sm_version(data->client, &data->ic_type,
-                                         &data->sm_version);
-       if (error)
-               return error;
-
         error = data->ops->get_version(data->client, true, &data->iap_version);
         if (error)
                 return error;
diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c

index 615d23ec0d8e2c9bfaa7a69cf22b924a2de59ed4..08e252a424802df0885711509e3e26528ec7b7ba 100644 (file)
--- a/drivers/input/mouse/elantech.c
+++ b/drivers/input/mouse/elantech.c
@@ -222,12 +222,8 @@ static int elantech_write_reg(struct psmouse *psmouse, unsigned char reg,
   */
  static void elantech_packet_dump(struct psmouse *psmouse)
  {
-       int     i;
-
-       psmouse_printk(KERN_DEBUG, psmouse, "PS/2 packet [");
-       for (i = 0; i < psmouse->pktsize; i++)
-               printk("%s0x%02x ", i ? ", " : " ", psmouse->packet[i]);
-       printk("]\n");
+       psmouse_printk(KERN_DEBUG, psmouse, "PS/2 packet [%*ph]\n",
+                      psmouse->pktsize, psmouse->packet);
  }
  
  /*
diff --git a/drivers/input/rmi4/rmi_bus.c b/drivers/input/rmi4/rmi_bus.c

index 253df96be4276cc9196a6c0ab25b04f838d061ac..a73580654c6b0cd3fd8faaeda2322bf88fdcd1ed 100644 (file)
--- a/drivers/input/rmi4/rmi_bus.c
+++ b/drivers/input/rmi4/rmi_bus.c
@@ -232,10 +232,7 @@ err_put_device:
  void rmi_unregister_function(struct rmi_function *fn)
  {
         device_del(&fn->dev);
-
-       if (fn->dev.of_node)
-               of_node_put(fn->dev.of_node);
-
+       of_node_put(fn->dev.of_node);
         put_device(&fn->dev);
  }
  
diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c

index 454195709a824b3e5a346b3aa9e087672dccdcae..b4d34086e73f5aac0654e607028f13f59ef771be 100644 (file)
--- a/drivers/input/serio/i8042.c
+++ b/drivers/input/serio/i8042.c
@@ -1277,6 +1277,7 @@ static int __init i8042_create_kbd_port(void)
         serio->start            = i8042_start;
         serio->stop             = i8042_stop;
         serio->close            = i8042_port_close;
+       serio->ps2_cmd_mutex    = &i8042_mutex;
         serio->port_data        = port;
         serio->dev.parent       = &i8042_platform_device->dev;
         strlcpy(serio->name, "i8042 KBD port", sizeof(serio->name));
@@ -1373,21 +1374,6 @@ static void i8042_unregister_ports(void)
         }
  }
  
-/*
- * Checks whether port belongs to i8042 controller.
- */
-bool i8042_check_port_owner(const struct serio *port)
-{
-       int i;
-
-       for (i = 0; i < I8042_NUM_PORTS; i++)
-               if (i8042_ports[i].serio == port)
-                       return true;
-
-       return false;
-}
-EXPORT_SYMBOL(i8042_check_port_owner);
-
  static void i8042_free_irqs(void)
  {
         if (i8042_aux_irq_registered)
diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c

index 316f2c8971011dae527d506ee18d49ce96f316e0..83e9c663aa6727da129d2fe8b0256f73f4581d53 100644 (file)
--- a/drivers/input/serio/libps2.c
+++ b/drivers/input/serio/libps2.c
@@ -56,19 +56,17 @@ EXPORT_SYMBOL(ps2_sendbyte);
  
  void ps2_begin_command(struct ps2dev *ps2dev)
  {
-       mutex_lock(&ps2dev->cmd_mutex);
+       struct mutex *m = ps2dev->serio->ps2_cmd_mutex ?: &ps2dev->cmd_mutex;
  
-       if (i8042_check_port_owner(ps2dev->serio))
-               i8042_lock_chip();
+       mutex_lock(m);
  }
  EXPORT_SYMBOL(ps2_begin_command);
  
  void ps2_end_command(struct ps2dev *ps2dev)
  {
-       if (i8042_check_port_owner(ps2dev->serio))
-               i8042_unlock_chip();
+       struct mutex *m = ps2dev->serio->ps2_cmd_mutex ?: &ps2dev->cmd_mutex;
  
-       mutex_unlock(&ps2dev->cmd_mutex);
+       mutex_unlock(m);
  }
  EXPORT_SYMBOL(ps2_end_command);
  
diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig

index ee02dc7422bd1a9c03d9171b83a75e3032144bc5..2fb1f430a4318fcddf65a85a813aae96861da09c 100644 (file)
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -1059,6 +1059,31 @@ config TOUCHSCREEN_RM_TS
           To compile this driver as a module, choose M here: the
           module will be called raydium_i2c_ts.
  
+config TOUCHSCREEN_SILEAD
+       tristate "Silead I2C touchscreen"
+       depends on I2C
+       help
+         Say Y here if you have the Silead touchscreen connected to
+         your system.
+
+         If unsure, say N.
+
+         To compile this driver as a module, choose M here: the
+         module will be called silead.
+
+config TOUCHSCREEN_SIS_I2C
+       tristate "SiS 9200 family I2C touchscreen"
+       depends on I2C
+       select CRC_ITU_T
+       depends on GPIOLIB || COMPILE_TEST
+       help
+         This enables support for SiS 9200 family over I2C based touchscreens.
+
+         If unsure, say N.
+
+         To compile this driver as a module, choose M here: the
+         module will be called sis_i2c.
+
  config TOUCHSCREEN_ST1232
         tristate "Sitronix ST1232 touchscreen controllers"
         depends on I2C
diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile

index 3315882905f7a74fffa83846133ff8643642e4e1..b4373d6be4021687c1f4fc4b4bc0faa054141ec0 100644 (file)
--- a/drivers/input/touchscreen/Makefile
+++ b/drivers/input/touchscreen/Makefile
@@ -64,6 +64,8 @@ obj-$(CONFIG_TOUCHSCREEN_PENMOUNT)    += penmount.o
  obj-$(CONFIG_TOUCHSCREEN_PIXCIR)       += pixcir_i2c_ts.o
  obj-$(CONFIG_TOUCHSCREEN_RM_TS)                += raydium_i2c_ts.o
  obj-$(CONFIG_TOUCHSCREEN_S3C2410)      += s3c2410_ts.o
+obj-$(CONFIG_TOUCHSCREEN_SILEAD)       += silead.o
+obj-$(CONFIG_TOUCHSCREEN_SIS_I2C)      += sis_i2c.o
  obj-$(CONFIG_TOUCHSCREEN_ST1232)       += st1232.o
  obj-$(CONFIG_TOUCHSCREEN_STMPE)                += stmpe-ts.o
  obj-$(CONFIG_TOUCHSCREEN_SUN4I)                += sun4i-ts.o
diff --git a/drivers/input/touchscreen/ili210x.c b/drivers/input/touchscreen/ili210x.c

index ddf694b9fffc42380889450005caaa538d2cf423..fe4848bd1f4c3ecc564a9eaa479f029f4e590c45 100644 (file)
--- a/drivers/input/touchscreen/ili210x.c
+++ b/drivers/input/touchscreen/ili210x.c
@@ -169,7 +169,7 @@ static ssize_t ili210x_calibrate(struct device *dev,
  
         return count;
  }
-static DEVICE_ATTR(calibrate, 0644, NULL, ili210x_calibrate);
+static DEVICE_ATTR(calibrate, S_IWUSR, NULL, ili210x_calibrate);
  
  static struct attribute *ili210x_attributes[] = {
         &dev_attr_calibrate.attr,
diff --git a/drivers/input/touchscreen/silead.c b/drivers/input/touchscreen/silead.c

new file mode 100644 (file)

index 0000000..7379fe1
--- /dev/null
+++ b/drivers/input/touchscreen/silead.c
@@ -0,0 +1,565 @@
+/* -------------------------------------------------------------------------
+ * Copyright (C) 2014-2015, Intel Corporation
+ *
+ * Derived from:
+ *  gslX68X.c
+ *  Copyright (C) 2010-2015, Shanghai Sileadinc Co.Ltd
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ * -------------------------------------------------------------------------
+ */
+
+#include <linux/i2c.h>
+#include <linux/module.h>
+#include <linux/acpi.h>
+#include <linux/interrupt.h>
+#include <linux/gpio/consumer.h>
+#include <linux/delay.h>
+#include <linux/firmware.h>
+#include <linux/input.h>
+#include <linux/input/mt.h>
+#include <linux/input/touchscreen.h>
+#include <linux/pm.h>
+#include <linux/irq.h>
+
+#include <asm/unaligned.h>
+
+#define SILEAD_TS_NAME         "silead_ts"
+
+#define SILEAD_REG_RESET       0xE0
+#define SILEAD_REG_DATA                0x80
+#define SILEAD_REG_TOUCH_NR    0x80
+#define SILEAD_REG_POWER       0xBC
+#define SILEAD_REG_CLOCK       0xE4
+#define SILEAD_REG_STATUS      0xB0
+#define SILEAD_REG_ID          0xFC
+#define SILEAD_REG_MEM_CHECK   0xB0
+
+#define SILEAD_STATUS_OK       0x5A5A5A5A
+#define SILEAD_TS_DATA_LEN     44
+#define SILEAD_CLOCK           0x04
+
+#define SILEAD_CMD_RESET       0x88
+#define SILEAD_CMD_START       0x00
+
+#define SILEAD_POINT_DATA_LEN  0x04
+#define SILEAD_POINT_Y_OFF      0x00
+#define SILEAD_POINT_Y_MSB_OFF 0x01
+#define SILEAD_POINT_X_OFF     0x02
+#define SILEAD_POINT_X_MSB_OFF 0x03
+#define SILEAD_TOUCH_ID_MASK   0xF0
+
+#define SILEAD_CMD_SLEEP_MIN   10000
+#define SILEAD_CMD_SLEEP_MAX   20000
+#define SILEAD_POWER_SLEEP     20
+#define SILEAD_STARTUP_SLEEP   30
+
+#define SILEAD_MAX_FINGERS     10
+
+/* Values passed to silead_ts_set_power() and written to the power GPIO. */
+enum silead_ts_power {
+       SILEAD_POWER_ON  = 1,
+       SILEAD_POWER_OFF = 0
+};
+
+/*
+ * Per-device driver state, allocated in probe and stored as the I2C client
+ * data.
+ *
+ * client      - I2C client this instance is bound to
+ * gpio_power  - "power" GPIO; when NULL, silead_ts_set_power() does nothing
+ * input       - input device the touches are reported through
+ * fw_name     - firmware file name handed to request_firmware()
+ * prop        - properties filled in by touchscreen_parse_properties() and
+ *               applied by touchscreen_set_mt_pos()
+ * max_fingers - "silead,max-fingers" property value (5 when it cannot be
+ *               read); written to SILEAD_REG_TOUCH_NR during init
+ * chip_id     - value read from SILEAD_REG_ID
+ * pos, slots, id - per-report scratch arrays holding the positions, the
+ *               slots assigned by input_mt_assign_slots() and the hardware
+ *               touch ids
+ */
+struct silead_ts_data {
+       struct i2c_client *client;
+       struct gpio_desc *gpio_power;
+       struct input_dev *input;
+       char fw_name[64];
+       struct touchscreen_properties prop;
+       u32 max_fingers;
+       u32 chip_id;
+       struct input_mt_pos pos[SILEAD_MAX_FINGERS];
+       int slots[SILEAD_MAX_FINGERS];
+       int id[SILEAD_MAX_FINGERS];
+};
+
+/*
+ * One record of the firmware image as consumed by silead_ts_load_fw():
+ * the four bytes of @val are written with @offset as the command byte.
+ */
+struct silead_fw_data {
+       u32 offset;
+       u32 val;
+};
+
+/*
+ * Allocate, configure and register the multi-touch input device.
+ *
+ * The X/Y axes default to 0..4095 and may then be adjusted by
+ * touchscreen_parse_properties(). Slot tracking is set up for
+ * data->max_fingers contacts.
+ *
+ * Returns 0 on success or a negative error code. The input device is
+ * device-managed, so no explicit cleanup is needed on failure.
+ */
+static int silead_ts_request_input_dev(struct silead_ts_data *data)
+{
+       struct device *dev = &data->client->dev;
+       int error;
+
+       data->input = devm_input_allocate_device(dev);
+       if (!data->input) {
+               dev_err(dev,
+                       "Failed to allocate input device\n");
+               return -ENOMEM;
+       }
+
+       input_set_abs_params(data->input, ABS_MT_POSITION_X, 0, 4095, 0, 0);
+       input_set_abs_params(data->input, ABS_MT_POSITION_Y, 0, 4095, 0, 0);
+       touchscreen_parse_properties(data->input, true, &data->prop);
+
+       /* Slot setup can fail; do not register a device without slots. */
+       error = input_mt_init_slots(data->input, data->max_fingers,
+                                   INPUT_MT_DIRECT | INPUT_MT_DROP_UNUSED |
+                                   INPUT_MT_TRACK);
+       if (error) {
+               dev_err(dev, "Failed to initialize MT slots: %d\n", error);
+               return error;
+       }
+
+       data->input->name = SILEAD_TS_NAME;
+       data->input->phys = "input/ts";
+       data->input->id.bustype = BUS_I2C;
+
+       error = input_register_device(data->input);
+       if (error) {
+               dev_err(dev, "Failed to register input device: %d\n", error);
+               return error;
+       }
+
+       return 0;
+}
+
+/*
+ * Drive the power GPIO to @state and then wait SILEAD_POWER_SLEEP ms.
+ * Nothing happens when the device has no power GPIO.
+ */
+static void silead_ts_set_power(struct i2c_client *client,
+                               enum silead_ts_power state)
+{
+       struct silead_ts_data *ts = i2c_get_clientdata(client);
+
+       if (!ts->gpio_power)
+               return;
+
+       gpiod_set_value_cansleep(ts->gpio_power, state);
+       msleep(SILEAD_POWER_SLEEP);
+}
+
+/*
+ * Read one touch report from the controller and forward it to the input
+ * core.
+ *
+ * Layout as parsed here: byte 0 holds the number of touches, and touch
+ * records of SILEAD_POINT_DATA_LEN bytes follow from offset
+ * SILEAD_POINT_DATA_LEN on. Each record carries little-endian 12-bit Y and
+ * X values; the top four bits of the X MSB are the hardware touch id.
+ *
+ * Only records that were really received are parsed: the block read may
+ * return fewer bytes than requested, and the remainder of the on-stack
+ * buffer is then uninitialized.
+ */
+static void silead_ts_read_data(struct i2c_client *client)
+{
+       struct silead_ts_data *data = i2c_get_clientdata(client);
+       struct input_dev *input = data->input;
+       struct device *dev = &client->dev;
+       u8 *bufp, buf[SILEAD_TS_DATA_LEN];
+       int touch_nr, touch_avail, error, i;
+
+       error = i2c_smbus_read_i2c_block_data(client, SILEAD_REG_DATA,
+                                             SILEAD_TS_DATA_LEN, buf);
+       if (error < 0) {
+               dev_err(dev, "Data read error %d\n", error);
+               return;
+       }
+
+       /* On success the return value is the number of bytes read. */
+       if (error < SILEAD_POINT_DATA_LEN) {
+               dev_err(dev, "Short data read: %d bytes\n", error);
+               return;
+       }
+
+       touch_nr = buf[0];
+       if (touch_nr > data->max_fingers) {
+               dev_warn(dev, "More touches reported then supported %d > %d\n",
+                        touch_nr, data->max_fingers);
+               touch_nr = data->max_fingers;
+       }
+
+       /*
+        * The first SILEAD_POINT_DATA_LEN bytes are the header, the rest are
+        * touch records. buf has room for SILEAD_MAX_FINGERS records, so
+        * capping at the records received also keeps the indices inside
+        * pos[], slots[] and id[] whatever max_fingers was set to.
+        */
+       touch_avail = error / SILEAD_POINT_DATA_LEN - 1;
+       if (touch_nr > touch_avail)
+               touch_nr = touch_avail;
+
+       bufp = buf + SILEAD_POINT_DATA_LEN;
+       for (i = 0; i < touch_nr; i++, bufp += SILEAD_POINT_DATA_LEN) {
+               /* Bits 4-7 are the touch id */
+               data->id[i] = (bufp[SILEAD_POINT_X_MSB_OFF] &
+                              SILEAD_TOUCH_ID_MASK) >> 4;
+               touchscreen_set_mt_pos(&data->pos[i], &data->prop,
+                       get_unaligned_le16(&bufp[SILEAD_POINT_X_OFF]) & 0xfff,
+                       get_unaligned_le16(&bufp[SILEAD_POINT_Y_OFF]) & 0xfff);
+       }
+
+       input_mt_assign_slots(input, data->slots, data->pos, touch_nr, 0);
+
+       for (i = 0; i < touch_nr; i++) {
+               input_mt_slot(input, data->slots[i]);
+               input_mt_report_slot_state(input, MT_TOOL_FINGER, true);
+               input_report_abs(input, ABS_MT_POSITION_X, data->pos[i].x);
+               input_report_abs(input, ABS_MT_POSITION_Y, data->pos[i].y);
+
+               dev_dbg(dev, "x=%d y=%d hw_id=%d sw_id=%d\n", data->pos[i].x,
+                       data->pos[i].y, data->id[i], data->slots[i]);
+       }
+
+       input_mt_sync_frame(input);
+       input_sync(input);
+}
+
+/*
+ * Run the initial register sequence. The writes are issued in this order,
+ * each followed by a 10-20 ms sleep:
+ *   1. SILEAD_CMD_RESET  -> SILEAD_REG_RESET
+ *   2. data->max_fingers -> SILEAD_REG_TOUCH_NR
+ *   3. SILEAD_CLOCK      -> SILEAD_REG_CLOCK
+ *   4. SILEAD_CMD_START  -> SILEAD_REG_RESET
+ *
+ * Returns 0 on success, or the error of the first failing write after
+ * logging it.
+ */
+static int silead_ts_init(struct i2c_client *client)
+{
+       struct silead_ts_data *data = i2c_get_clientdata(client);
+       int error;
+
+       error = i2c_smbus_write_byte_data(client, SILEAD_REG_RESET,
+                                         SILEAD_CMD_RESET);
+       if (error)
+               goto i2c_write_err;
+       usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX);
+
+       error = i2c_smbus_write_byte_data(client, SILEAD_REG_TOUCH_NR,
+                                       data->max_fingers);
+       if (error)
+               goto i2c_write_err;
+       usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX);
+
+       error = i2c_smbus_write_byte_data(client, SILEAD_REG_CLOCK,
+                                         SILEAD_CLOCK);
+       if (error)
+               goto i2c_write_err;
+       usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX);
+
+       error = i2c_smbus_write_byte_data(client, SILEAD_REG_RESET,
+                                         SILEAD_CMD_START);
+       if (error)
+               goto i2c_write_err;
+       usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX);
+
+       return 0;
+
+i2c_write_err:
+       dev_err(&client->dev, "Registers clear error %d\n", error);
+       return error;
+}
+
+/*
+ * Reset the controller. The writes are issued in this order, each followed
+ * by a 10-20 ms sleep:
+ *   1. SILEAD_CMD_RESET -> SILEAD_REG_RESET
+ *   2. SILEAD_CLOCK     -> SILEAD_REG_CLOCK
+ *   3. SILEAD_CMD_START -> SILEAD_REG_POWER
+ *
+ * Returns 0 on success, or the error of the first failing write after
+ * logging it.
+ */
+static int silead_ts_reset(struct i2c_client *client)
+{
+       int error;
+
+       error = i2c_smbus_write_byte_data(client, SILEAD_REG_RESET,
+                                         SILEAD_CMD_RESET);
+       if (error)
+               goto i2c_write_err;
+       usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX);
+
+       error = i2c_smbus_write_byte_data(client, SILEAD_REG_CLOCK,
+                                         SILEAD_CLOCK);
+       if (error)
+               goto i2c_write_err;
+       usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX);
+
+       error = i2c_smbus_write_byte_data(client, SILEAD_REG_POWER,
+                                         SILEAD_CMD_START);
+       if (error)
+               goto i2c_write_err;
+       usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX);
+
+       return 0;
+
+i2c_write_err:
+       dev_err(&client->dev, "Chip reset error %d\n", error);
+       return error;
+}
+
+/*
+ * Write 0x00 to SILEAD_REG_RESET and, when that succeeds, wait
+ * SILEAD_STARTUP_SLEEP ms. Returns 0 on success or the write error.
+ */
+static int silead_ts_startup(struct i2c_client *client)
+{
+       int ret = i2c_smbus_write_byte_data(client, SILEAD_REG_RESET, 0x00);
+
+       if (!ret) {
+               msleep(SILEAD_STARTUP_SLEEP);
+               return 0;
+       }
+
+       dev_err(&client->dev, "Startup error %d\n", ret);
+       return ret;
+}
+
+/*
+ * Request the firmware file named in data->fw_name and write it to the
+ * controller.
+ *
+ * The image is treated as an array of struct silead_fw_data records; any
+ * trailing bytes that do not form a whole record are ignored. For each
+ * record the four bytes of @val are written with @offset as the command
+ * byte. Loading stops at the first failing write.
+ *
+ * Returns 0 on success or a negative error code. The firmware is released
+ * on every path once it has been obtained.
+ */
+static int silead_ts_load_fw(struct i2c_client *client)
+{
+       struct device *dev = &client->dev;
+       struct silead_ts_data *data = i2c_get_clientdata(client);
+       unsigned int fw_size, i;
+       const struct firmware *fw;
+       const struct silead_fw_data *fw_data;
+       int error;
+
+       dev_dbg(dev, "Firmware file name: %s\n", data->fw_name);
+
+       error = request_firmware(&fw, data->fw_name, dev);
+       if (error) {
+               dev_err(dev, "Firmware request error %d\n", error);
+               return error;
+       }
+
+       fw_size = fw->size / sizeof(*fw_data);
+       /* The firmware buffer is read-only; keep it const-qualified. */
+       fw_data = (const struct silead_fw_data *)fw->data;
+
+       for (i = 0; i < fw_size; i++) {
+               error = i2c_smbus_write_i2c_block_data(client,
+                                       fw_data[i].offset, 4,
+                                       (const u8 *)&fw_data[i].val);
+               if (error) {
+                       dev_err(dev, "Firmware load error %d\n", error);
+                       break;
+               }
+       }
+
+       release_firmware(fw);
+       return error;
+}
+
+/*
+ * Read the 32-bit little-endian value at SILEAD_REG_STATUS.
+ *
+ * The return type is u32, so a failed read hands back the negative error
+ * code converted to unsigned; the callers in this file only compare the
+ * result against SILEAD_STATUS_OK.
+ */
+static u32 silead_ts_get_status(struct i2c_client *client)
+{
+       __le32 raw;
+       int ret;
+
+       ret = i2c_smbus_read_i2c_block_data(client, SILEAD_REG_STATUS,
+                                           sizeof(raw), (u8 *)&raw);
+       if (ret >= 0)
+               return le32_to_cpu(raw);
+
+       dev_err(&client->dev, "Status read error %d\n", ret);
+       return ret;
+}
+
+/*
+ * Read the 32-bit little-endian chip ID from SILEAD_REG_ID, store it in
+ * data->chip_id and log it. Returns 0 on success or the negative error of
+ * the read.
+ */
+static int silead_ts_get_id(struct i2c_client *client)
+{
+       struct silead_ts_data *data = i2c_get_clientdata(client);
+       __le32 chip_id;
+       int error;
+
+       error = i2c_smbus_read_i2c_block_data(client, SILEAD_REG_ID,
+                                             sizeof(chip_id), (u8 *)&chip_id);
+       if (error < 0) {
+               dev_err(&client->dev, "Chip ID read error %d\n", error);
+               return error;
+       }
+
+       data->chip_id = le32_to_cpu(chip_id);
+       /* Zero-pad so no spaces follow "0x", and terminate the log line. */
+       dev_info(&client->dev, "Silead chip ID: 0x%08X\n", data->chip_id);
+
+       return 0;
+}
+
+/*
+ * Full bring-up of the controller, in this order: power off then on via
+ * the power GPIO, read the chip ID, run the init and reset register
+ * sequences, load the firmware, start the chip and finally check that the
+ * status register reads SILEAD_STATUS_OK.
+ *
+ * Returns 0 on success, the error of the first failing step, or -ENODEV
+ * when the status check fails.
+ */
+static int silead_ts_setup(struct i2c_client *client)
+{
+       int error;
+       u32 status;
+
+       silead_ts_set_power(client, SILEAD_POWER_OFF);
+       silead_ts_set_power(client, SILEAD_POWER_ON);
+
+       error = silead_ts_get_id(client);
+       if (error)
+               return error;
+
+       error = silead_ts_init(client);
+       if (error)
+               return error;
+
+       error = silead_ts_reset(client);
+       if (error)
+               return error;
+
+       error = silead_ts_load_fw(client);
+       if (error)
+               return error;
+
+       error = silead_ts_startup(client);
+       if (error)
+               return error;
+
+       /* A failed status read also ends up here, as it is not STATUS_OK. */
+       status = silead_ts_get_status(client);
+       if (status != SILEAD_STATUS_OK) {
+               dev_err(&client->dev,
+                       "Initialization error, status: 0x%X\n", status);
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+/*
+ * Threaded interrupt handler: @id is the struct silead_ts_data registered
+ * with the IRQ. Reads and reports one touch frame, then always reports the
+ * interrupt as handled.
+ */
+static irqreturn_t silead_ts_threaded_irq_handler(int irq, void *id)
+{
+       struct silead_ts_data *ts = id;
+
+       silead_ts_read_data(ts->client);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Pick up the optional device properties:
+ *  - "silead,max-fingers": number of touches to track. Falls back to 5
+ *    when the property cannot be read, and is capped at
+ *    SILEAD_MAX_FINGERS, the size of the per-touch arrays in
+ *    struct silead_ts_data.
+ *  - "touchscreen-fw-name": replaces the default firmware file name in
+ *    data->fw_name when present.
+ */
+static void silead_ts_read_props(struct i2c_client *client)
+{
+       struct silead_ts_data *data = i2c_get_clientdata(client);
+       struct device *dev = &client->dev;
+       const char *str;
+       int error;
+
+       error = device_property_read_u32(dev, "silead,max-fingers",
+                                        &data->max_fingers);
+       if (error) {
+               dev_dbg(dev, "Max fingers read error %d\n", error);
+               data->max_fingers = 5; /* Most devices handle up-to 5 fingers */
+       } else if (data->max_fingers > SILEAD_MAX_FINGERS) {
+               dev_warn(dev, "Max fingers %u too large, limiting to %d\n",
+                        data->max_fingers, SILEAD_MAX_FINGERS);
+               data->max_fingers = SILEAD_MAX_FINGERS;
+       }
+
+       error = device_property_read_string(dev, "touchscreen-fw-name", &str);
+       if (!error)
+               snprintf(data->fw_name, sizeof(data->fw_name), "%s", str);
+       else
+               dev_dbg(dev, "Firmware file name read error. Using default.\n");
+}
+
+/*
+ * Fill data->fw_name with the default firmware file name.
+ *
+ * With CONFIG_ACPI and an ACPI-enumerated device the name is the matched
+ * ACPI ID converted to lower case plus ".fw"; -ENODEV is returned when no
+ * ACPI ID matches. In every other case the name is the I2C device id name
+ * plus ".fw". Returns 0 on success.
+ */
+#ifdef CONFIG_ACPI
+static int silead_ts_set_default_fw_name(struct silead_ts_data *data,
+                                        const struct i2c_device_id *id)
+{
+       const struct acpi_device_id *acpi_id;
+       struct device *dev = &data->client->dev;
+       char *p;
+
+       if (ACPI_HANDLE(dev)) {
+               acpi_id = acpi_match_device(dev->driver->acpi_match_table, dev);
+               if (!acpi_id)
+                       return -ENODEV;
+
+               snprintf(data->fw_name, sizeof(data->fw_name), "%s.fw",
+                       acpi_id->id);
+
+               /* Walk to the terminator once instead of calling strlen()
+                * on every iteration.
+                */
+               for (p = data->fw_name; *p; p++)
+                       *p = tolower(*p);
+       } else {
+               snprintf(data->fw_name, sizeof(data->fw_name), "%s.fw",
+                       id->name);
+       }
+
+       return 0;
+}
+#else
+static int silead_ts_set_default_fw_name(struct silead_ts_data *data,
+                                        const struct i2c_device_id *id)
+{
+       snprintf(data->fw_name, sizeof(data->fw_name), "%s.fw", id->name);
+       return 0;
+}
+#endif
+
+/*
+ * Bind the driver to an I2C client.
+ *
+ * Checks the adapter functionality, allocates the driver state, works out
+ * the firmware name and properties, obtains the optional power GPIO,
+ * brings the controller up, registers the input device and finally
+ * requests the threaded interrupt.
+ *
+ * Every resource acquired here is device-managed, so the error paths need
+ * no explicit cleanup and no remove callback is required to release them.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int silead_ts_probe(struct i2c_client *client,
+                          const struct i2c_device_id *id)
+{
+       struct silead_ts_data *data;
+       struct device *dev = &client->dev;
+       int error;
+
+       if (!i2c_check_functionality(client->adapter,
+                                    I2C_FUNC_I2C |
+                                    I2C_FUNC_SMBUS_READ_I2C_BLOCK |
+                                    I2C_FUNC_SMBUS_WRITE_I2C_BLOCK)) {
+               dev_err(dev, "I2C functionality check failed\n");
+               return -ENXIO;
+       }
+
+       data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       i2c_set_clientdata(client, data);
+       data->client = client;
+
+       error = silead_ts_set_default_fw_name(data, id);
+       if (error)
+               return error;
+
+       silead_ts_read_props(client);
+
+       /* We must have the IRQ provided by DT or ACPI subsystem */
+       if (client->irq <= 0)
+               return -ENODEV;
+
+       /*
+        * Power GPIO pin. Use the managed variant so the descriptor is
+        * released when probing fails later on or the driver is unbound.
+        */
+       data->gpio_power = devm_gpiod_get_optional(dev, "power", GPIOD_OUT_LOW);
+       if (IS_ERR(data->gpio_power)) {
+               if (PTR_ERR(data->gpio_power) != -EPROBE_DEFER)
+                       dev_err(dev, "Shutdown GPIO request failed\n");
+               return PTR_ERR(data->gpio_power);
+       }
+
+       error = silead_ts_setup(client);
+       if (error)
+               return error;
+
+       error = silead_ts_request_input_dev(data);
+       if (error)
+               return error;
+
+       error = devm_request_threaded_irq(dev, client->irq,
+                                         NULL, silead_ts_threaded_irq_handler,
+                                         IRQF_ONESHOT, client->name, data);
+       if (error) {
+               if (error != -EPROBE_DEFER)
+                       dev_err(dev, "IRQ request failed %d\n", error);
+               return error;
+       }
+
+       return 0;
+}
+
+/* System suspend: switch the controller off via the power GPIO. */
+static int __maybe_unused silead_ts_suspend(struct device *dev)
+{
+       silead_ts_set_power(to_i2c_client(dev), SILEAD_POWER_OFF);
+
+       return 0;
+}
+
+/*
+ * System resume: power the controller on, run the reset sequence, start
+ * the chip and check that the status register reads SILEAD_STATUS_OK.
+ * The firmware is not loaded again on this path.
+ *
+ * Returns 0 on success, the error of a failing step, or -ENODEV when the
+ * status check fails.
+ */
+static int __maybe_unused silead_ts_resume(struct device *dev)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       int error, status;
+
+       silead_ts_set_power(client, SILEAD_POWER_ON);
+
+       error = silead_ts_reset(client);
+       if (error)
+               return error;
+
+       error = silead_ts_startup(client);
+       if (error)
+               return error;
+
+       /* silead_ts_get_status() returns u32; it is held in an int here. */
+       status = silead_ts_get_status(client);
+       if (status != SILEAD_STATUS_OK) {
+               dev_err(dev, "Resume error, status: 0x%02x\n", status);
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+/* Suspend/resume callbacks, hooked up through the driver's .pm field. */
+static SIMPLE_DEV_PM_OPS(silead_ts_pm, silead_ts_suspend, silead_ts_resume);
+
+/*
+ * I2C device names handled by this driver. The matched name plus ".fw" is
+ * the default firmware file name for devices without an ACPI handle.
+ */
+static const struct i2c_device_id silead_ts_id[] = {
+       { "gsl1680", 0 },
+       { "gsl1688", 0 },
+       { "gsl3670", 0 },
+       { "gsl3675", 0 },
+       { "gsl3692", 0 },
+       { "mssl1680", 0 },
+       { }
+};
+MODULE_DEVICE_TABLE(i2c, silead_ts_id);
+
+#ifdef CONFIG_ACPI
+/*
+ * ACPI IDs handled by this driver. The matched ID, converted to lower
+ * case, plus ".fw" is the default firmware file name for ACPI devices.
+ */
+static const struct acpi_device_id silead_ts_acpi_match[] = {
+       { "GSL1680", 0 },
+       { "GSL1688", 0 },
+       { "GSL3670", 0 },
+       { "GSL3675", 0 },
+       { "GSL3692", 0 },
+       { "MSSL1680", 0 },
+       { }
+};
+MODULE_DEVICE_TABLE(acpi, silead_ts_acpi_match);
+#endif
+
+/* Driver registration; only a probe callback is provided. */
+static struct i2c_driver silead_ts_driver = {
+       .probe = silead_ts_probe,
+       .id_table = silead_ts_id,
+       .driver = {
+               .name = SILEAD_TS_NAME,
+               .acpi_match_table = ACPI_PTR(silead_ts_acpi_match),
+               .pm = &silead_ts_pm,
+       },
+};
+module_i2c_driver(silead_ts_driver);
+
+MODULE_AUTHOR("Robert Dolca <robert.dolca@intel.com>");
+MODULE_DESCRIPTION("Silead I2C touchscreen driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/input/touchscreen/sis_i2c.c b/drivers/input/touchscreen/sis_i2c.c

new file mode 100644 (file)

index 0000000..8d93f8c
--- /dev/null
+++ b/drivers/input/touchscreen/sis_i2c.c
@@ -0,0 +1,413 @@
+/*
+ * Touch Screen driver for SiS 9200 family I2C Touch panels
+ *
+ * Copyright (C) 2015 SiS, Inc.
+ * Copyright (C) 2016 Nextfour Group
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/crc-itu-t.h>
+#include <linux/delay.h>
+#include <linux/i2c.h>
+#include <linux/input.h>
+#include <linux/input/mt.h>
+#include <linux/interrupt.h>
+#include <linux/gpio/consumer.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+
+#define SIS_I2C_NAME           "sis_i2c_ts"
+
+/*
+ * The I2C packet format:
+ * le16                byte count
+ * u8          Report ID
+ * <contact data - variable length>
+ * u8          Number of contacts
+ * le16                Scan Time (optional)
+ * le16                CRC
+ *
+ * One touch point information consists of 6+ bytes, the order is:
+ * u8          contact state
+ * u8          finger id
+ * le16                x axis
+ * le16                y axis
+ * u8          contact width (optional)
+ * u8          contact height (optional)
+ * u8          pressure (optional)
+ *
+ * Maximum amount of data transmitted in one shot is 64 bytes, if controller
+ * needs to report more contacts than fit in one packet it will send true
+ * number of contacts in first packet and 0 as number of contacts in second
+ * packet.
+ */
+
+#define SIS_MAX_PACKET_SIZE            64
+
+#define SIS_PKT_LEN_OFFSET             0
+#define SIS_PKT_REPORT_OFFSET          2 /* Report ID/type */
+#define SIS_PKT_CONTACT_OFFSET         3 /* First contact */
+
+#define SIS_SCAN_TIME_LEN              2
+
+/* Supported report types */
+#define SIS_ALL_IN_ONE_PACKAGE         0x10
+#define SIS_PKT_IS_TOUCH(x)            (((x) & 0x0f) == 0x01)
+#define SIS_PKT_IS_HIDI2C(x)           (((x) & 0x0f) == 0x06)
+
+/* Contact properties within report */
+#define SIS_PKT_HAS_AREA(x)            ((x) & BIT(4))
+#define SIS_PKT_HAS_PRESSURE(x)                ((x) & BIT(5))
+#define SIS_PKT_HAS_SCANTIME(x)                ((x) & BIT(6))
+
+/* Contact size */
+#define SIS_BASE_LEN_PER_CONTACT       6
+#define SIS_AREA_LEN_PER_CONTACT       2
+#define SIS_PRESSURE_LEN_PER_CONTACT   1
+
+/* Offsets within contact data */
+#define SIS_CONTACT_STATUS_OFFSET      0
+#define SIS_CONTACT_ID_OFFSET          1 /* Contact ID */
+#define SIS_CONTACT_X_OFFSET           2
+#define SIS_CONTACT_Y_OFFSET           4
+#define SIS_CONTACT_WIDTH_OFFSET       6
+#define SIS_CONTACT_HEIGHT_OFFSET      7
+#define SIS_CONTACT_PRESSURE_OFFSET(id)        (SIS_PKT_HAS_AREA(id) ? 8 : 6)
+
+/* Individual contact state */
+#define SIS_STATUS_UP                  0x0
+#define SIS_STATUS_DOWN                        0x3
+
+/* Touchscreen parameters */
+#define SIS_MAX_FINGERS                        10
+#define SIS_MAX_X                      4095
+#define SIS_MAX_Y                      4095
+#define SIS_MAX_PRESSURE               255
+
+/*
+ * Resolution diagonal: sqrt(SIS_MAX_X^2 + SIS_MAX_Y^2), i.e.
+ * sqrt(2 * 4095^2) ~= 5791.2, rounded up to 5792.
+ */
+#define SIS_AREA_LENGTH_LONGER         5792
+/* Same value as SIS_AREA_LENGTH_LONGER */
+#define SIS_AREA_LENGTH_SHORT          5792
+/* Factor the raw contact width/height is multiplied by before reporting */
+#define SIS_AREA_UNIT                  (5792 / 32)
+
+/*
+ * Per-device driver state, allocated in sis_ts_probe().
+ *
+ * @client:     I2C client the touch packets are read from
+ * @input:      input device the contact events are reported on
+ * @attn_gpio:  optional "attn" GPIO; the IRQ handler keeps reading packets
+ *              for as long as it reads non-zero
+ * @reset_gpio: optional "reset" GPIO, pulsed by sis_ts_reset()
+ * @packet:     receive buffer holding one packet (SIS_MAX_PACKET_SIZE bytes)
+ */
+struct sis_ts_data {
+       struct i2c_client *client;
+       struct input_dev *input;
+
+       struct gpio_desc *attn_gpio;
+       struct gpio_desc *reset_gpio;
+
+       u8 packet[SIS_MAX_PACKET_SIZE];
+};
+
+/*
+ * sis_read_packet - read one packet from the controller and validate it.
+ * @client: I2C client to read from
+ * @buf: destination buffer, at least SIS_MAX_PACKET_SIZE bytes long
+ * @num_contacts: set to the contact count stored in the packet
+ * @contact_size: set to the size in bytes of one contact record
+ *
+ * Returns 0 on success or a negative error code: the error reported by
+ * i2c_master_recv(), -EIO when nothing was transferred, -E2BIG or -EINVAL
+ * for an out-of-range length field, -EINVAL for a CRC mismatch or an
+ * unsupported report ID.
+ */
+static int sis_read_packet(struct i2c_client *client, u8 *buf,
+                          unsigned int *num_contacts,
+                          unsigned int *contact_size)
+{
+       int count_idx;
+       int ret;
+       u16 len;
+       u16 crc, pkg_crc;
+       u8 report_id;
+
+       ret = i2c_master_recv(client, buf, SIS_MAX_PACKET_SIZE);
+       if (ret < 0)
+               return ret;     /* hand the bus error to the caller as is */
+       if (ret == 0)
+               return -EIO;
+
+       len = get_unaligned_le16(&buf[SIS_PKT_LEN_OFFSET]);
+       if (len > SIS_MAX_PACKET_SIZE) {
+               dev_err(&client->dev,
+                       "%s: invalid packet length (%d vs %d)\n",
+                       __func__, len, SIS_MAX_PACKET_SIZE);
+               return -E2BIG;
+       }
+
+       if (len < 10)
+               return -EINVAL;
+
+       report_id = buf[SIS_PKT_REPORT_OFFSET];
+       /*
+        * The contact count is the last byte of the packet unless a CRC
+        * and/or a scan time follow it; step back over those below.
+        */
+       count_idx  = len - 1;
+       *contact_size = SIS_BASE_LEN_PER_CONTACT;
+
+       if (report_id != SIS_ALL_IN_ONE_PACKAGE) {
+               if (SIS_PKT_IS_TOUCH(report_id)) {
+                       /*
+                        * Calculate CRC ignoring packet length
+                        * in the beginning and CRC transmitted
+                        * at the end of the packet.
+                        */
+                       crc = crc_itu_t(0, buf + 2, len - 2 - 2);
+                       pkg_crc = get_unaligned_le16(&buf[len - 2]);
+
+                       if (crc != pkg_crc) {
+                               dev_err(&client->dev,
+                                       "%s: CRC Error (%d vs %d)\n",
+                                       __func__, crc, pkg_crc);
+                               return -EINVAL;
+                       }
+
+                       count_idx -= 2;
+
+               } else if (!SIS_PKT_IS_HIDI2C(report_id)) {
+                       dev_err(&client->dev,
+                               "%s: invalid packet ID %#02x\n",
+                               __func__, report_id);
+                       return -EINVAL;
+               }
+
+               if (SIS_PKT_HAS_SCANTIME(report_id))
+                       count_idx -= SIS_SCAN_TIME_LEN;
+
+               /* Optional per-contact fields enlarge every contact record */
+               if (SIS_PKT_HAS_AREA(report_id))
+                       *contact_size += SIS_AREA_LEN_PER_CONTACT;
+               if (SIS_PKT_HAS_PRESSURE(report_id))
+                       *contact_size += SIS_PRESSURE_LEN_PER_CONTACT;
+       }
+
+       *num_contacts = buf[count_idx];
+       return 0;
+}
+
+/*
+ * sis_ts_report_contact - report one contact record to the input core.
+ * @ts: driver state
+ * @data: start of the contact record inside the packet
+ * @id: report ID of the packet; selects which optional fields are present
+ *
+ * Returns 0 on success, -EINVAL for an unknown contact status or -ENOENT
+ * when no MT slot could be obtained for the contact.
+ */
+static int sis_ts_report_contact(struct sis_ts_data *ts, const u8 *data, u8 id)
+{
+       struct input_dev *input = ts->input;
+       const u8 status = data[SIS_CONTACT_STATUS_OFFSET];
+       /* Values reported when the packet carries no area/pressure data */
+       u8 width = 1, height = 1, pressure = 1;
+       bool touching;
+       u16 x, y;
+       int slot;
+
+       switch (status) {
+       case SIS_STATUS_DOWN:
+               touching = true;
+               break;
+       case SIS_STATUS_UP:
+               touching = false;
+               break;
+       default:
+               dev_err(&ts->client->dev, "Unexpected touch status: %#02x\n",
+                       status);
+               return -EINVAL;
+       }
+
+       slot = input_mt_get_slot_by_key(input, data[SIS_CONTACT_ID_OFFSET]);
+       if (slot < 0)
+               return -ENOENT;
+
+       input_mt_slot(input, slot);
+       input_mt_report_slot_state(input, MT_TOOL_FINGER, touching);
+
+       /* A released contact carries no further data worth reporting */
+       if (!touching)
+               return 0;
+
+       /*
+        * SIS_ALL_IN_ONE_PACKAGE has no optional fields, so the flag bits
+        * are only evaluated for the other report types.
+        */
+       if (id != SIS_ALL_IN_ONE_PACKAGE && SIS_PKT_HAS_AREA(id)) {
+               width = data[SIS_CONTACT_WIDTH_OFFSET];
+               height = data[SIS_CONTACT_HEIGHT_OFFSET];
+       }
+       if (id != SIS_ALL_IN_ONE_PACKAGE && SIS_PKT_HAS_PRESSURE(id))
+               pressure = data[SIS_CONTACT_PRESSURE_OFFSET(id)];
+
+       x = get_unaligned_le16(&data[SIS_CONTACT_X_OFFSET]);
+       y = get_unaligned_le16(&data[SIS_CONTACT_Y_OFFSET]);
+
+       input_report_abs(input, ABS_MT_TOUCH_MAJOR, width * SIS_AREA_UNIT);
+       input_report_abs(input, ABS_MT_TOUCH_MINOR, height * SIS_AREA_UNIT);
+       input_report_abs(input, ABS_MT_PRESSURE, pressure);
+       input_report_abs(input, ABS_MT_POSITION_X, x);
+       input_report_abs(input, ABS_MT_POSITION_Y, y);
+
+       return 0;
+}
+
+/*
+ * sis_ts_handle_packet - read and report one complete touch frame.
+ * @ts: driver state
+ *
+ * Reads packets until every contact announced by the first packet has been
+ * reported, then closes the frame with input_mt_sync_frame()/input_sync().
+ * Packets other than SIS_ALL_IN_ONE_PACKAGE carry at most 5 contacts; the
+ * remaining ones arrive in a follow-up packet whose own contact count must
+ * be 0.
+ */
+static void sis_ts_handle_packet(struct sis_ts_data *ts)
+{
+       const u8 *packet_end = ts->packet + sizeof(ts->packet);
+       const u8 *contact;
+       unsigned int num_to_report = 0;
+       unsigned int num_contacts;
+       unsigned int num_reported;
+       unsigned int contact_size;
+       int error;
+       u8 report_id;
+
+       do {
+               error = sis_read_packet(ts->client, ts->packet,
+                                       &num_contacts, &contact_size);
+               if (error)
+                       break;
+
+               if (num_to_report == 0) {
+                       num_to_report = num_contacts;
+               } else if (num_contacts != 0) {
+                       dev_err(&ts->client->dev,
+                               "%s: nonzero (%d) point count in tail packet\n",
+                               __func__, num_contacts);
+                       break;
+               }
+
+               report_id = ts->packet[SIS_PKT_REPORT_OFFSET];
+               contact = &ts->packet[SIS_PKT_CONTACT_OFFSET];
+               num_reported = 0;
+
+               while (num_to_report > 0) {
+                       /*
+                        * The contact count comes straight from the device
+                        * and is not checked against the packet size, so
+                        * never parse a contact record that would extend
+                        * past the receive buffer. Drop the rest of the
+                        * frame instead.
+                        */
+                       if (packet_end - contact < (ptrdiff_t)contact_size) {
+                               dev_err(&ts->client->dev,
+                                       "%s: contact data exceeds packet size\n",
+                                       __func__);
+                               num_to_report = 0;
+                               break;
+                       }
+
+                       error = sis_ts_report_contact(ts, contact, report_id);
+                       if (error)
+                               break;
+
+                       contact += contact_size;
+                       num_to_report--;
+                       num_reported++;
+
+                       if (report_id != SIS_ALL_IN_ONE_PACKAGE &&
+                           num_reported >= 5) {
+                               /*
+                                * The remainder of contacts is sent
+                                * in the 2nd packet.
+                                */
+                               break;
+                       }
+               }
+       } while (num_to_report > 0);
+
+       input_mt_sync_frame(ts->input);
+       input_sync(ts->input);
+}
+
+/*
+ * Threaded interrupt handler: process at least one frame, then keep going
+ * for as long as an attention GPIO is present and still reads non-zero.
+ */
+static irqreturn_t sis_ts_irq_handler(int irq, void *dev_id)
+{
+       struct sis_ts_data *ts = dev_id;
+
+       for (;;) {
+               sis_ts_handle_packet(ts);
+
+               /* Without an attention line one pass is all we can do */
+               if (!ts->attn_gpio)
+                       break;
+               if (!gpiod_get_value_cansleep(ts->attn_gpio))
+                       break;
+       }
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * sis_ts_reset - pulse the optional reset GPIO.
+ * @ts: driver state
+ *
+ * Does nothing when no reset GPIO is available. Otherwise the line is
+ * asserted for 1-2 ms and the controller is given 100 ms after release.
+ * The function sleeps, so the _cansleep GPIO setters are used; they also
+ * work for reset lines behind GPIO controllers that may sleep.
+ */
+static void sis_ts_reset(struct sis_ts_data *ts)
+{
+       if (!ts->reset_gpio)
+               return;
+
+       /* Get out of reset */
+       usleep_range(1000, 2000);
+       gpiod_set_value_cansleep(ts->reset_gpio, 1);
+       usleep_range(1000, 2000);
+       gpiod_set_value_cansleep(ts->reset_gpio, 0);
+       msleep(100);
+}
+
+/*
+ * Probe: allocate the driver state, look up the optional "attn" and "reset"
+ * GPIOs, reset the controller, then set up the multitouch input device and
+ * the threaded interrupt handler.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int sis_ts_probe(struct i2c_client *client,
+                       const struct i2c_device_id *id)
+{
+       struct device *dev = &client->dev;
+       struct input_dev *input;
+       struct sis_ts_data *ts;
+       int err;
+
+       ts = devm_kzalloc(dev, sizeof(*ts), GFP_KERNEL);
+       if (!ts)
+               return -ENOMEM;
+
+       ts->client = client;
+       i2c_set_clientdata(client, ts);
+
+       ts->attn_gpio = devm_gpiod_get_optional(dev, "attn", GPIOD_IN);
+       if (IS_ERR(ts->attn_gpio)) {
+               err = PTR_ERR(ts->attn_gpio);
+               /* Probe deferral is not worth a log line */
+               if (err != -EPROBE_DEFER)
+                       dev_err(dev,
+                               "Failed to get attention GPIO: %d\n", err);
+               return err;
+       }
+
+       ts->reset_gpio = devm_gpiod_get_optional(dev, "reset",
+                                                GPIOD_OUT_LOW);
+       if (IS_ERR(ts->reset_gpio)) {
+               err = PTR_ERR(ts->reset_gpio);
+               if (err != -EPROBE_DEFER)
+                       dev_err(dev, "Failed to get reset GPIO: %d\n", err);
+               return err;
+       }
+
+       sis_ts_reset(ts);
+
+       input = devm_input_allocate_device(dev);
+       ts->input = input;
+       if (!input) {
+               dev_err(dev, "Failed to allocate input device\n");
+               return -ENOMEM;
+       }
+
+       input->name = "SiS Touchscreen";
+       input->id.bustype = BUS_I2C;
+
+       /* Axis ranges advertised to user space */
+       input_set_abs_params(input, ABS_MT_POSITION_X, 0, SIS_MAX_X, 0, 0);
+       input_set_abs_params(input, ABS_MT_POSITION_Y, 0, SIS_MAX_Y, 0, 0);
+       input_set_abs_params(input, ABS_MT_PRESSURE, 0, SIS_MAX_PRESSURE, 0, 0);
+       input_set_abs_params(input, ABS_MT_TOUCH_MAJOR,
+                            0, SIS_AREA_LENGTH_LONGER, 0, 0);
+       input_set_abs_params(input, ABS_MT_TOUCH_MINOR,
+                            0, SIS_AREA_LENGTH_SHORT, 0, 0);
+
+       err = input_mt_init_slots(input, SIS_MAX_FINGERS, INPUT_MT_DIRECT);
+       if (err) {
+               dev_err(dev, "Failed to initialize MT slots: %d\n", err);
+               return err;
+       }
+
+       /* No hard-IRQ part: all work happens in the threaded handler */
+       err = devm_request_threaded_irq(dev, client->irq,
+                                       NULL, sis_ts_irq_handler,
+                                       IRQF_ONESHOT,
+                                       client->name, ts);
+       if (err) {
+               dev_err(dev, "Failed to request IRQ: %d\n", err);
+               return err;
+       }
+
+       err = input_register_device(ts->input);
+       if (err) {
+               dev_err(dev, "Failed to register input device: %d\n", err);
+               return err;
+       }
+
+       return 0;
+}
+
+#ifdef CONFIG_OF
+/* Device tree compatible strings; table only built when CONFIG_OF is set */
+static const struct of_device_id sis_ts_dt_ids[] = {
+       { .compatible = "sis,9200-ts" },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, sis_ts_dt_ids);
+#endif
+
+/* I2C device names handled by this driver */
+static const struct i2c_device_id sis_ts_id[] = {
+       { SIS_I2C_NAME, 0 },
+       { "9200-ts",    0 },
+       { /* sentinel */  }
+};
+MODULE_DEVICE_TABLE(i2c, sis_ts_id);
+
+/* Driver description: name, match tables and probe entry point */
+static struct i2c_driver sis_ts_driver = {
+       .driver = {
+               .name   = SIS_I2C_NAME,
+               .of_match_table = of_match_ptr(sis_ts_dt_ids),
+       },
+       .probe          = sis_ts_probe,
+       .id_table       = sis_ts_id,
+};
+module_i2c_driver(sis_ts_driver);
+
+MODULE_DESCRIPTION("SiS 9200 Family Touchscreen Driver");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Mika Penttilä <mika.penttila@nextfour.com>");
diff --git a/drivers/mtd/ubi/attach.c b/drivers/mtd/ubi/attach.c

index c1aaf0336cf2e389386b4a6f7c87a8609652b474..903becd3141051d23142baa75f52df1939a2bc94 100644 (file)
--- a/drivers/mtd/ubi/attach.c
+++ b/drivers/mtd/ubi/attach.c
@@ -174,6 +174,40 @@ static int add_corrupted(struct ubi_attach_info *ai, int pnum, int ec)
         return 0;
  }
  
+/**
+ * add_fastmap - add a Fastmap related physical eraseblock.
+ * @ai: attaching information
+ * @pnum: physical eraseblock number the VID header came from
+ * @vid_hdr: the volume identifier header
+ * @ec: erase counter of the physical eraseblock
+ *
+ * This function allocates a 'struct ubi_ainf_peb' object for a Fastmap
+ * physical eraseblock @pnum and adds it to the 'fastmap' list.
+ * Such blocks can be Fastmap super and data blocks from both the most
+ * recent Fastmap we're attaching from or from old Fastmaps which will
+ * be erased. Returns zero on success and %-ENOMEM on allocation failure.
+ */
+static int add_fastmap(struct ubi_attach_info *ai, int pnum,
+                      struct ubi_vid_hdr *vid_hdr, int ec)
+{
+       struct ubi_ainf_peb *aeb;
+
+       aeb = kmem_cache_alloc(ai->aeb_slab_cache, GFP_KERNEL);
+       if (!aeb)
+               return -ENOMEM;
+
+       aeb->pnum = pnum;
+       aeb->vol_id = be32_to_cpu(vid_hdr->vol_id);
+       aeb->sqnum = be64_to_cpu(vid_hdr->sqnum);
+       aeb->ec = ec;
+       list_add(&aeb->u.list, &ai->fastmap);
+
+       dbg_bld("add to fastmap list: PEB %d, vol_id %d, sqnum: %llu", pnum,
+               aeb->vol_id, aeb->sqnum);
+
+       return 0;
+}
+
  /**
   * validate_vid_hdr - check volume identifier header.
   * @ubi: UBI device description object
@@ -803,13 +837,26 @@ out_unlock:
         return err;
  }
  
+/*
+ * Tell whether @vol_id is exempt from the "unsupported internal volume"
+ * handling in scan_peb(): always the layout volume, plus whatever
+ * ubi_is_fm_vol() accepts when Fastmap support is built in.
+ */
+static bool vol_ignored(int vol_id)
+{
+       if (vol_id == UBI_LAYOUT_VOLUME_ID)
+               return true;
+
+#ifdef CONFIG_MTD_UBI_FASTMAP
+       return ubi_is_fm_vol(vol_id);
+#else
+       return false;
+#endif
+}
+
  /**
   * scan_peb - scan and process UBI headers of a PEB.
   * @ubi: UBI device description object
   * @ai: attaching information
   * @pnum: the physical eraseblock number
- * @vid: The volume ID of the found volume will be stored in this pointer
- * @sqnum: The sqnum of the found volume will be stored in this pointer
+ * @fast: true if we're scanning for a Fastmap
   *
   * This function reads UBI headers of PEB @pnum, checks them, and adds
   * information about this PEB to the corresponding list or RB-tree in the
@@ -817,9 +864,9 @@ out_unlock:
   * successfully handled and a negative error code in case of failure.
   */
  static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai,
-                   int pnum, int *vid, unsigned long long *sqnum)
+                   int pnum, bool fast)
  {
-       long long uninitialized_var(ec);
+       long long ec;
         int err, bitflips = 0, vol_id = -1, ec_err = 0;
  
         dbg_bld("scan PEB %d", pnum);
@@ -935,6 +982,20 @@ static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai,
                          */
                         ai->maybe_bad_peb_count += 1;
         case UBI_IO_BAD_HDR:
+                       /*
+                        * If we're facing a bad VID header we have to drop *all*
+                        * Fastmap data structures we find. The most recent Fastmap
+                        * could be bad and therefore there is a chance that we attach
+                        * from an old one. On a fine MTD stack a PEB must not render
+                        * bad all of a sudden, but the reality is different.
+                        * So, let's be paranoid and help finding the root cause by
+                        * falling back to scanning mode instead of attaching with a
+                        * bad EBA table and cause data corruption which is hard to
+                        * analyze.
+                        */
+                       if (fast)
+                               ai->force_full_scan = 1;
+
                 if (ec_err)
                         /*
                          * Both headers are corrupted. There is a possibility
@@ -991,21 +1052,15 @@ static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai,
         }
  
         vol_id = be32_to_cpu(vidh->vol_id);
-       if (vid)
-               *vid = vol_id;
-       if (sqnum)
-               *sqnum = be64_to_cpu(vidh->sqnum);
-       if (vol_id > UBI_MAX_VOLUMES && vol_id != UBI_LAYOUT_VOLUME_ID) {
+       if (vol_id > UBI_MAX_VOLUMES && !vol_ignored(vol_id)) {
                 int lnum = be32_to_cpu(vidh->lnum);
  
                 /* Unsupported internal volume */
                 switch (vidh->compat) {
                 case UBI_COMPAT_DELETE:
-                       if (vol_id != UBI_FM_SB_VOLUME_ID
-                           && vol_id != UBI_FM_DATA_VOLUME_ID) {
-                               ubi_msg(ubi, "\"delete\" compatible internal volume %d:%d found, will remove it",
-                                       vol_id, lnum);
-                       }
+                       ubi_msg(ubi, "\"delete\" compatible internal volume %d:%d found, will remove it",
+                               vol_id, lnum);
+
                         err = add_to_list(ai, pnum, vol_id, lnum,
                                           ec, 1, &ai->erase);
                         if (err)
@@ -1037,7 +1092,12 @@ static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai,
         if (ec_err)
                 ubi_warn(ubi, "valid VID header but corrupted EC header at PEB %d",
                          pnum);
-       err = ubi_add_to_av(ubi, ai, pnum, ec, vidh, bitflips);
+
+       if (ubi_is_fm_vol(vol_id))
+               err = add_fastmap(ai, pnum, vidh, ec);
+       else
+               err = ubi_add_to_av(ubi, ai, pnum, ec, vidh, bitflips);
+
         if (err)
                 return err;
  
@@ -1186,6 +1246,10 @@ static void destroy_ai(struct ubi_attach_info *ai)
                 list_del(&aeb->u.list);
                 kmem_cache_free(ai->aeb_slab_cache, aeb);
         }
+       list_for_each_entry_safe(aeb, aeb_tmp, &ai->fastmap, u.list) {
+               list_del(&aeb->u.list);
+               kmem_cache_free(ai->aeb_slab_cache, aeb);
+       }
  
         /* Destroy the volume RB-tree */
         rb = ai->volumes.rb_node;
@@ -1245,7 +1309,7 @@ static int scan_all(struct ubi_device *ubi, struct ubi_attach_info *ai,
                 cond_resched();
  
                 dbg_gen("process PEB %d", pnum);
-               err = scan_peb(ubi, ai, pnum, NULL, NULL);
+               err = scan_peb(ubi, ai, pnum, false);
                 if (err < 0)
                         goto out_vidh;
         }
@@ -1311,6 +1375,7 @@ static struct ubi_attach_info *alloc_ai(void)
         INIT_LIST_HEAD(&ai->free);
         INIT_LIST_HEAD(&ai->erase);
         INIT_LIST_HEAD(&ai->alien);
+       INIT_LIST_HEAD(&ai->fastmap);
         ai->volumes = RB_ROOT;
         ai->aeb_slab_cache = kmem_cache_create("ubi_aeb_slab_cache",
                                                sizeof(struct ubi_ainf_peb),
@@ -1326,7 +1391,7 @@ static struct ubi_attach_info *alloc_ai(void)
  #ifdef CONFIG_MTD_UBI_FASTMAP
  
  /**
- * scan_fastmap - try to find a fastmap and attach from it.
+ * scan_fast - try to find a fastmap and attach from it.
   * @ubi: UBI device description object
   * @ai: attach info object
   *
@@ -1337,52 +1402,58 @@ static struct ubi_attach_info *alloc_ai(void)
   */
  static int scan_fast(struct ubi_device *ubi, struct ubi_attach_info **ai)
  {
-       int err, pnum, fm_anchor = -1;
-       unsigned long long max_sqnum = 0;
+       int err, pnum;
+       struct ubi_attach_info *scan_ai;
  
         err = -ENOMEM;
  
+       scan_ai = alloc_ai();
+       if (!scan_ai)
+               goto out;
+
         ech = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL);
         if (!ech)
-               goto out;
+               goto out_ai;
  
         vidh = ubi_zalloc_vid_hdr(ubi, GFP_KERNEL);
         if (!vidh)
                 goto out_ech;
  
         for (pnum = 0; pnum < UBI_FM_MAX_START; pnum++) {
-               int vol_id = -1;
-               unsigned long long sqnum = -1;
                 cond_resched();
  
                 dbg_gen("process PEB %d", pnum);
-               err = scan_peb(ubi, *ai, pnum, &vol_id, &sqnum);
+               err = scan_peb(ubi, scan_ai, pnum, true);
                 if (err < 0)
                         goto out_vidh;
-
-               if (vol_id == UBI_FM_SB_VOLUME_ID && sqnum > max_sqnum) {
-                       max_sqnum = sqnum;
-                       fm_anchor = pnum;
-               }
         }
  
         ubi_free_vid_hdr(ubi, vidh);
         kfree(ech);
  
-       if (fm_anchor < 0)
-               return UBI_NO_FASTMAP;
+       if (scan_ai->force_full_scan)
+               err = UBI_NO_FASTMAP;
+       else
+               err = ubi_scan_fastmap(ubi, *ai, scan_ai);
  
-       destroy_ai(*ai);
-       *ai = alloc_ai();
-       if (!*ai)
-               return -ENOMEM;
+       if (err) {
+               /*
+                * Didn't attach via fastmap, do a full scan but reuse what
+                * we've already scanned.
+                */
+               destroy_ai(*ai);
+               *ai = scan_ai;
+       } else
+               destroy_ai(scan_ai);
  
-       return ubi_scan_fastmap(ubi, *ai, fm_anchor);
+       return err;
  
  out_vidh:
         ubi_free_vid_hdr(ubi, vidh);
  out_ech:
         kfree(ech);
+out_ai:
+       destroy_ai(scan_ai);
  out:
         return err;
  }
diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c

index ef3618299494f4973fd528ae72325490eb622b2c..0680516bb4728007871d0064b987d73141fa9aed 100644 (file)
--- a/drivers/mtd/ubi/build.c
+++ b/drivers/mtd/ubi/build.c
@@ -874,7 +874,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num,
         for (i = 0; i < UBI_MAX_DEVICES; i++) {
                 ubi = ubi_devices[i];
                 if (ubi && mtd->index == ubi->mtd->index) {
-                       ubi_err(ubi, "mtd%d is already attached to ubi%d",
+                       pr_err("ubi: mtd%d is already attached to ubi%d",
                                 mtd->index, i);
                         return -EEXIST;
                 }
@@ -889,7 +889,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num,
          * no sense to attach emulated MTD devices, so we prohibit this.
          */
         if (mtd->type == MTD_UBIVOLUME) {
-               ubi_err(ubi, "refuse attaching mtd%d - it is already emulated on top of UBI",
+               pr_err("ubi: refuse attaching mtd%d - it is already emulated on top of UBI",
                         mtd->index);
                 return -EINVAL;
         }
@@ -900,7 +900,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num,
                         if (!ubi_devices[ubi_num])
                                 break;
                 if (ubi_num == UBI_MAX_DEVICES) {
-                       ubi_err(ubi, "only %d UBI devices may be created",
+                       pr_err("ubi: only %d UBI devices may be created",
                                 UBI_MAX_DEVICES);
                         return -ENFILE;
                 }
@@ -910,7 +910,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num,
  
                 /* Make sure ubi_num is not busy */
                 if (ubi_devices[ubi_num]) {
-                       ubi_err(ubi, "already exists");
+                       pr_err("ubi: ubi%i already exists", ubi_num);
                         return -EEXIST;
                 }
         }
@@ -992,6 +992,9 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num,
                         goto out_detach;
         }
  
+       /* Make device "available" before it becomes accessible via sysfs */
+       ubi_devices[ubi_num] = ubi;
+
         err = uif_init(ubi, &ref);
         if (err)
                 goto out_detach;
@@ -1036,7 +1039,6 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num,
         wake_up_process(ubi->bgt_thread);
         spin_unlock(&ubi->wl_lock);
  
-       ubi_devices[ubi_num] = ubi;
         ubi_notify_all(ubi, UBI_VOLUME_ADDED, NULL);
         return ubi_num;
  
@@ -1047,6 +1049,7 @@ out_uif:
         ubi_assert(ref);
         uif_close(ubi);
  out_detach:
+       ubi_devices[ubi_num] = NULL;
         ubi_wl_close(ubi);
         ubi_free_internal_volumes(ubi);
         vfree(ubi->vtbl);
diff --git a/drivers/mtd/ubi/fastmap.c b/drivers/mtd/ubi/fastmap.c

index 990898b9dc7289f881ecf357573bbc1d79bbf6ad..48eb55f344eb12b23e2f77c5736cffa816b3ded0 100644 (file)
--- a/drivers/mtd/ubi/fastmap.c
+++ b/drivers/mtd/ubi/fastmap.c
@@ -15,20 +15,22 @@
   */
  
  #include <linux/crc32.h>
+#include <linux/bitmap.h>
  #include "ubi.h"
  
  /**
   * init_seen - allocate memory for used for debugging.
   * @ubi: UBI device description object
   */
-static inline int *init_seen(struct ubi_device *ubi)
+static inline unsigned long *init_seen(struct ubi_device *ubi)
  {
-       int *ret;
+       unsigned long *ret;
  
         if (!ubi_dbg_chk_fastmap(ubi))
                 return NULL;
  
-       ret = kcalloc(ubi->peb_count, sizeof(int), GFP_KERNEL);
+       ret = kcalloc(BITS_TO_LONGS(ubi->peb_count), sizeof(unsigned long),
+                     GFP_KERNEL);
         if (!ret)
                 return ERR_PTR(-ENOMEM);
  
@@ -39,7 +41,7 @@ static inline int *init_seen(struct ubi_device *ubi)
   * free_seen - free the seen logic integer array.
   * @seen: integer array of @ubi->peb_count size
   */
-static inline void free_seen(int *seen)
+static inline void free_seen(unsigned long *seen)
  {
         kfree(seen);
  }
@@ -50,12 +52,12 @@ static inline void free_seen(int *seen)
   * @pnum: The PEB to be makred as seen
   * @seen: integer array of @ubi->peb_count size
   */
-static inline void set_seen(struct ubi_device *ubi, int pnum, int *seen)
+static inline void set_seen(struct ubi_device *ubi, int pnum, unsigned long *seen)
  {
         if (!ubi_dbg_chk_fastmap(ubi) || !seen)
                 return;
  
-       seen[pnum] = 1;
+       set_bit(pnum, seen);
  }
  
  /**
@@ -63,7 +65,7 @@ static inline void set_seen(struct ubi_device *ubi, int pnum, int *seen)
   * @ubi: UBI device description object
   * @seen: integer array of @ubi->peb_count size
   */
-static int self_check_seen(struct ubi_device *ubi, int *seen)
+static int self_check_seen(struct ubi_device *ubi, unsigned long *seen)
  {
         int pnum, ret = 0;
  
@@ -71,7 +73,7 @@ static int self_check_seen(struct ubi_device *ubi, int *seen)
                 return 0;
  
         for (pnum = 0; pnum < ubi->peb_count; pnum++) {
-               if (!seen[pnum] && ubi->lookuptbl[pnum]) {
+               if (!test_bit(pnum, seen) && ubi->lookuptbl[pnum]) {
                         ubi_err(ubi, "self-check failed for PEB %d, fastmap didn't see it", pnum);
                         ret = -EINVAL;
                 }
@@ -578,7 +580,7 @@ static int count_fastmap_pebs(struct ubi_attach_info *ai)
         list_for_each_entry(aeb, &ai->free, u.list)
                 n++;
  
-        ubi_rb_for_each_entry(rb1, av, &ai->volumes, rb)
+       ubi_rb_for_each_entry(rb1, av, &ai->volumes, rb)
                 ubi_rb_for_each_entry(rb2, aeb, &av->root, u.rb)
                         n++;
  
@@ -849,28 +851,58 @@ fail:
         return ret;
  }
  
+/**
+ * find_fm_anchor - find the most recent Fastmap superblock (anchor)
+ * @ai: UBI attach info whose 'fastmap' list is searched
+ *
+ * Returns the PEB number of the Fastmap superblock with the highest
+ * (non-zero) sequence number, or -1 if there is none.
+ */
+static int find_fm_anchor(struct ubi_attach_info *ai)
+{
+       struct ubi_ainf_peb *peb;
+       unsigned long long newest = 0;
+       int anchor = -1;
+
+       list_for_each_entry(peb, &ai->fastmap, u.list) {
+               /* Only superblocks qualify as anchor */
+               if (peb->vol_id != UBI_FM_SB_VOLUME_ID)
+                       continue;
+               if (peb->sqnum <= newest)
+                       continue;
+
+               newest = peb->sqnum;
+               anchor = peb->pnum;
+       }
+
+       return anchor;
+}
+
  /**
   * ubi_scan_fastmap - scan the fastmap.
   * @ubi: UBI device object
   * @ai: UBI attach info to be filled
- * @fm_anchor: The fastmap starts at this PEB
+ * @scan_ai: UBI attach info from the first 64 PEBs,
+ *           used to find the most recent Fastmap data structure
   *
   * Returns 0 on success, UBI_NO_FASTMAP if no fastmap was found,
   * UBI_BAD_FASTMAP if one was found but is not usable.
   * < 0 indicates an internal error.
   */
  int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai,
-                    int fm_anchor)
+                    struct ubi_attach_info *scan_ai)
  {
         struct ubi_fm_sb *fmsb, *fmsb2;
         struct ubi_vid_hdr *vh;
         struct ubi_ec_hdr *ech;
         struct ubi_fastmap_layout *fm;
-       int i, used_blocks, pnum, ret = 0;
+       struct ubi_ainf_peb *tmp_aeb, *aeb;
+       int i, used_blocks, pnum, fm_anchor, ret = 0;
         size_t fm_size;
         __be32 crc, tmp_crc;
         unsigned long long sqnum = 0;
  
+       fm_anchor = find_fm_anchor(scan_ai);
+       if (fm_anchor < 0)
+               return UBI_NO_FASTMAP;
+
+       /* Move all (possible) fastmap blocks into our new attach structure. */
+       list_for_each_entry_safe(aeb, tmp_aeb, &scan_ai->fastmap, u.list)
+               list_move_tail(&aeb->u.list, &ai->fastmap);
+
         down_write(&ubi->fm_protect);
         memset(ubi->fm_buf, 0, ubi->fm_size);
  
@@ -945,6 +977,13 @@ int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai,
                         goto free_hdr;
                 }
  
+               if (i == 0 && pnum != fm_anchor) {
+                       ubi_err(ubi, "Fastmap anchor PEB mismatch: PEB: %i vs. %i",
+                               pnum, fm_anchor);
+                       ret = UBI_BAD_FASTMAP;
+                       goto free_hdr;
+               }
+
                 ret = ubi_io_read_ec_hdr(ubi, pnum, ech, 0);
                 if (ret && ret != UBI_IO_BITFLIPS) {
                         ubi_err(ubi, "unable to read fastmap block# %i EC (PEB: %i)",
@@ -1102,7 +1141,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
         struct rb_node *tmp_rb;
         int ret, i, j, free_peb_count, used_peb_count, vol_count;
         int scrub_peb_count, erase_peb_count;
-       int *seen_pebs = NULL;
+       unsigned long *seen_pebs = NULL;
  
         fm_raw = ubi->fm_buf;
         memset(ubi->fm_buf, 0, ubi->fm_size);
diff --git a/drivers/mtd/ubi/gluebi.c b/drivers/mtd/ubi/gluebi.c

index cb7c075f2144969d5f416d440060455ee2904933..1cb287ec32adbb486c3289c23a89191e7be48221 100644 (file)
--- a/drivers/mtd/ubi/gluebi.c
+++ b/drivers/mtd/ubi/gluebi.c
@@ -99,9 +99,6 @@ static int gluebi_get_device(struct mtd_info *mtd)
         struct gluebi_device *gluebi;
         int ubi_mode = UBI_READONLY;
  
-       if (!try_module_get(THIS_MODULE))
-               return -ENODEV;
-
         if (mtd->flags & MTD_WRITEABLE)
                 ubi_mode = UBI_READWRITE;
  
@@ -129,7 +126,6 @@ static int gluebi_get_device(struct mtd_info *mtd)
                                        ubi_mode);
         if (IS_ERR(gluebi->desc)) {
                 mutex_unlock(&devices_mutex);
-               module_put(THIS_MODULE);
                 return PTR_ERR(gluebi->desc);
         }
         gluebi->refcnt += 1;
@@ -153,7 +149,6 @@ static void gluebi_put_device(struct mtd_info *mtd)
         gluebi->refcnt -= 1;
         if (gluebi->refcnt == 0)
                 ubi_close_volume(gluebi->desc);
-       module_put(THIS_MODULE);
         mutex_unlock(&devices_mutex);
  }
  
diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c

index 10cf3b549959ce46f9c517b77fbcdd84e274a3d2..ff8cafe1e5cd1f1489ef41745f21f6c1110b61c5 100644 (file)
--- a/drivers/mtd/ubi/io.c
+++ b/drivers/mtd/ubi/io.c
@@ -1019,7 +1019,7 @@ int ubi_io_read_vid_hdr(struct ubi_device *ubi, int pnum,
  
         p = (char *)vid_hdr - ubi->vid_hdr_shift;
         read_err = ubi_io_read(ubi, p, pnum, ubi->vid_hdr_aloffset,
-                         ubi->vid_hdr_alsize);
+                         ubi->vid_hdr_shift + UBI_VID_HDR_SIZE);
         if (read_err && read_err != UBI_IO_BITFLIPS && !mtd_is_eccerr(read_err))
                 return read_err;
  
diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h

index 61d4e99755a4bef64aa7821727444911d2816d77..b616a115c9d38b9e8f2d5b3209629f1e98ab0501 100644 (file)
--- a/drivers/mtd/ubi/ubi.h
+++ b/drivers/mtd/ubi/ubi.h
@@ -703,6 +703,8 @@ struct ubi_ainf_volume {
   * @erase: list of physical eraseblocks which have to be erased
   * @alien: list of physical eraseblocks which should not be used by UBI (e.g.,
   *         those belonging to "preserve"-compatible internal volumes)
+ * @fastmap: list of physical eraseblocks which relate to fastmap (e.g.,
+ *           eraseblocks of the current and not yet erased old fastmap blocks)
   * @corr_peb_count: count of PEBs in the @corr list
   * @empty_peb_count: count of PEBs which are presumably empty (contain only
   *                   0xFF bytes)
@@ -713,6 +715,8 @@ struct ubi_ainf_volume {
   * @vols_found: number of volumes found
   * @highest_vol_id: highest volume ID
   * @is_empty: flag indicating whether the MTD device is empty or not
+ * @force_full_scan: flag indicating whether we need to do a full scan and drop
+ *                   all existing Fastmap data structures
   * @min_ec: lowest erase counter value
   * @max_ec: highest erase counter value
   * @max_sqnum: highest sequence number value
@@ -731,6 +735,7 @@ struct ubi_attach_info {
         struct list_head free;
         struct list_head erase;
         struct list_head alien;
+       struct list_head fastmap;
         int corr_peb_count;
         int empty_peb_count;
         int alien_peb_count;
@@ -739,6 +744,7 @@ struct ubi_attach_info {
         int vols_found;
         int highest_vol_id;
         int is_empty;
+       int force_full_scan;
         int min_ec;
         int max_ec;
         unsigned long long max_sqnum;
@@ -911,7 +917,7 @@ int ubi_compare_lebs(struct ubi_device *ubi, const struct ubi_ainf_peb *aeb,
  size_t ubi_calc_fm_size(struct ubi_device *ubi);
  int ubi_update_fastmap(struct ubi_device *ubi);
  int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai,
-                    int fm_anchor);
+                    struct ubi_attach_info *scan_ai);
  #else
  static inline int ubi_update_fastmap(struct ubi_device *ubi) { return 0; }
  #endif
@@ -1105,4 +1111,42 @@ static inline int idx2vol_id(const struct ubi_device *ubi, int idx)
                 return idx;
  }
  
+/**
+ * ubi_is_fm_vol - tell whether a volume ID belongs to Fastmap.
+ * @vol_id: volume ID to test
+ *
+ * Returns true for the Fastmap superblock and Fastmap data volume IDs,
+ * false for every other ID.
+ */
+static inline bool ubi_is_fm_vol(int vol_id)
+{
+       return vol_id == UBI_FM_SB_VOLUME_ID ||
+              vol_id == UBI_FM_DATA_VOLUME_ID;
+}
+
+/**
+ * ubi_find_fm_block - look up a PEB among the blocks of the current Fastmap.
+ * @ubi: UBI device description object
+ * @pnum: physical eraseblock number to search for
+ *
+ * Returns the wear-leveling entry of the current fastmap whose PEB number
+ * equals @pnum, or %NULL when there is no fastmap or no entry matches.
+ */
+static inline struct ubi_wl_entry *ubi_find_fm_block(const struct ubi_device *ubi,
+                                                    int pnum)
+{
+       int blk;
+
+       /* No fastmap attached: nothing can match. */
+       if (!ubi->fm)
+               return NULL;
+
+       for (blk = 0; blk < ubi->fm->used_blocks; blk++) {
+               struct ubi_wl_entry *entry = ubi->fm->e[blk];
+
+               if (entry->pnum == pnum)
+                       return entry;
+       }
+
+       return NULL;
+}
+
  #endif /* !__UBI_UBI_H__ */
diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c

index 10059dfdc1b6db3e570348bd03ad41f968329340..0138f526474a25d0fa14bc5f8d375e74235287af 100644 (file)
--- a/drivers/mtd/ubi/vmt.c
+++ b/drivers/mtd/ubi/vmt.c
@@ -488,13 +488,6 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs)
                 spin_unlock(&ubi->volumes_lock);
         }
  
-       /* Change volume table record */
-       vtbl_rec = ubi->vtbl[vol_id];
-       vtbl_rec.reserved_pebs = cpu_to_be32(reserved_pebs);
-       err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
-       if (err)
-               goto out_acc;
-
         if (pebs < 0) {
                 for (i = 0; i < -pebs; i++) {
                         err = ubi_eba_unmap_leb(ubi, vol, reserved_pebs + i);
@@ -512,6 +505,24 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs)
                 spin_unlock(&ubi->volumes_lock);
         }
  
+       /*
+        * When we shrink a volume we have to flush all pending (erase) work.
+        * Otherwise it can happen that upon next attach UBI finds a LEB with
+        * lnum > highest_lnum and refuses to attach.
+        */
+       if (pebs < 0) {
+               err = ubi_wl_flush(ubi, vol_id, UBI_ALL);
+               if (err)
+                       goto out_acc;
+       }
+
+       /* Change volume table record */
+       vtbl_rec = ubi->vtbl[vol_id];
+       vtbl_rec.reserved_pebs = cpu_to_be32(reserved_pebs);
+       err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
+       if (err)
+               goto out_acc;
+
         vol->reserved_pebs = reserved_pebs;
         if (vol->vol_type == UBI_DYNAMIC_VOLUME) {
                 vol->used_ebs = reserved_pebs;
diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c

index 959c7b12e0b1be4863960e43fa9dfbe67f2eb58a..f4533266d7b26cbac8ca44861d88277a5d0cbabc 100644 (file)
--- a/drivers/mtd/ubi/wl.c
+++ b/drivers/mtd/ubi/wl.c
@@ -1598,19 +1598,44 @@ int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai)
                 }
         }
  
-       dbg_wl("found %i PEBs", found_pebs);
+       list_for_each_entry(aeb, &ai->fastmap, u.list) {
+               cond_resched();
+
+               e = ubi_find_fm_block(ubi, aeb->pnum);
  
-       if (ubi->fm) {
-               ubi_assert(ubi->good_peb_count ==
-                          found_pebs + ubi->fm->used_blocks);
+               if (e) {
+                       ubi_assert(!ubi->lookuptbl[e->pnum]);
+                       ubi->lookuptbl[e->pnum] = e;
+               } else {
+                       /*
+                        * Usually old Fastmap PEBs are scheduled for erasure
+                        * and we don't have to care about them but if we face
+                        * a power cut before scheduling them we need to
+                        * take care of them here.
+                        */
+                       if (ubi->lookuptbl[aeb->pnum])
+                               continue;
+
+                       e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL);
+                       if (!e)
+                               goto out_free;
  
-               for (i = 0; i < ubi->fm->used_blocks; i++) {
-                       e = ubi->fm->e[i];
+                       e->pnum = aeb->pnum;
+                       e->ec = aeb->ec;
+                       ubi_assert(!ubi->lookuptbl[e->pnum]);
                         ubi->lookuptbl[e->pnum] = e;
+                       if (schedule_erase(ubi, e, aeb->vol_id, aeb->lnum, 0)) {
+                               wl_entry_destroy(ubi, e);
+                               goto out_free;
+                       }
                 }
+
+               found_pebs++;
         }
-       else
-               ubi_assert(ubi->good_peb_count == found_pebs);
+
+       dbg_wl("found %i PEBs", found_pebs);
+
+       ubi_assert(ubi->good_peb_count == found_pebs);
  
         reserved_pebs = WL_RESERVED_PEBS;
         ubi_fastmap_init(ubi, &reserved_pebs);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h

index 4705e2dea42342d870fff93ab3d070e08a5ac715..e0ebe1378cb22eeaf7dad5d8b85ac1ed0c5ca528 100644 (file)
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
@@ -104,6 +104,8 @@ enum {
  
  enum CPL_error {
         CPL_ERR_NONE               = 0,
+       CPL_ERR_TCAM_PARITY        = 1,
+       CPL_ERR_TCAM_MISS          = 2,
         CPL_ERR_TCAM_FULL          = 3,
         CPL_ERR_BAD_LENGTH         = 15,
         CPL_ERR_BAD_ROUTE          = 18,
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c

index f4497cf4d06dcfda02c3ea565f82a4280efbe14c..d728704d0c7b523c6dc90d35c10390d122439222 100644 (file)
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -721,6 +721,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
  #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET         0x98
  #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET                0xa0
  #define QUERY_DEV_CAP_ETH_BACKPL_OFFSET                0x9c
+#define QUERY_DEV_CAP_DIAG_RPRT_PER_PORT       0x9c
  #define QUERY_DEV_CAP_FW_REASSIGN_MAC          0x9d
  #define QUERY_DEV_CAP_VXLAN                    0x9e
  #define QUERY_DEV_CAP_MAD_DEMUX_OFFSET         0xb0
@@ -935,6 +936,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
                 dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP;
         if (field32 & (1 << 7))
                 dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT;
+       MLX4_GET(field32, outbox, QUERY_DEV_CAP_DIAG_RPRT_PER_PORT);
+       if (field32 & (1 << 17))
+               dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT;
         MLX4_GET(field, outbox, QUERY_DEV_CAP_FW_REASSIGN_MAC);
         if (field & 1<<6)
                 dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN;
@@ -2457,6 +2461,42 @@ int mlx4_NOP(struct mlx4_dev *dev)
                         MLX4_CMD_NATIVE);
  }
  
+/**
+ * mlx4_query_diag_counters - read diagnostic counters from the device
+ * @dev: mlx4 device to query
+ * @op_modifier: op modifier passed to the DIAG_RPRT command
+ * @offset: byte offsets into the command output mailbox, one per counter
+ * @value: output array; value[i] receives the counter found at offset[i]
+ * @array_len: number of entries in @offset and @value
+ * @port: port passed to the DIAG_RPRT command
+ *
+ * Returns 0 on success, the mailbox allocation or command error on failure,
+ * or -EINVAL if an offset does not leave room for a whole counter inside
+ * the mailbox. On -EINVAL the entries of @value preceding the offending one
+ * have already been written.
+ */
+int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier,
+                            const u32 offset[],
+                            u32 value[], size_t array_len, u8 port)
+{
+       struct mlx4_cmd_mailbox *mailbox;
+       u32 *outbox;
+       size_t i;
+       int ret;
+
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox))
+               return PTR_ERR(mailbox);
+
+       outbox = mailbox->buf;
+
+       ret = mlx4_cmd_box(dev, 0, mailbox->dma, port, op_modifier,
+                          MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A,
+                          MLX4_CMD_NATIVE);
+       if (ret)
+               goto out;
+
+       for (i = 0; i < array_len; i++) {
+               /*
+                * The whole counter has to lie inside the mailbox, so the
+                * last acceptable offset is the mailbox size minus the size
+                * of one counter; anything beyond would read past the
+                * buffer.
+                */
+               if (offset[i] > MLX4_MAILBOX_SIZE - sizeof(value[i])) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               MLX4_GET(value[i], outbox, offset[i]);
+       }
+
+out:
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       return ret;
+}
+EXPORT_SYMBOL(mlx4_query_diag_counters);
+
  int mlx4_get_phys_port_id(struct mlx4_dev *dev)
  {
         u8 port;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c

index 04bc522605a03bb9cb7e6602f9c003ccfc28433b..c07f4d01b70e55958530a7ee640633bc65f5f4de 100644 (file)
--- a/drivers/net/ethernet/mellanox/mlx5/core/srq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
@@ -63,12 +63,12 @@ void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type)
                 complete(&srq->free);
  }
  
-static int get_pas_size(void *srqc)
+static int get_pas_size(struct mlx5_srq_attr *in)
  {
-       u32 log_page_size = MLX5_GET(srqc, srqc, log_page_size) + 12;
-       u32 log_srq_size  = MLX5_GET(srqc, srqc, log_srq_size);
-       u32 log_rq_stride = MLX5_GET(srqc, srqc, log_rq_stride);
-       u32 page_offset   = MLX5_GET(srqc, srqc, page_offset);
+       u32 log_page_size = in->log_page_size + 12;
+       u32 log_srq_size  = in->log_size;
+       u32 log_rq_stride = in->wqe_shift;
+       u32 page_offset   = in->page_offset;
         u32 po_quanta     = 1 << (log_page_size - 6);
         u32 rq_sz         = 1 << (log_srq_size + 4 + log_rq_stride);
         u32 page_size     = 1 << log_page_size;
@@ -78,57 +78,58 @@ static int get_pas_size(void *srqc)
         return rq_num_pas * sizeof(u64);
  }
  
-static void rmpc_srqc_reformat(void *srqc, void *rmpc, bool srqc_to_rmpc)
+static void set_wq(void *wq, struct mlx5_srq_attr *in)
  {
-       void *wq = MLX5_ADDR_OF(rmpc, rmpc, wq);
-
-       if (srqc_to_rmpc) {
-               switch (MLX5_GET(srqc, srqc, state)) {
-               case MLX5_SRQC_STATE_GOOD:
-                       MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
-                       break;
-               case MLX5_SRQC_STATE_ERROR:
-                       MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_ERR);
-                       break;
-               default:
-                       pr_warn("%s: %d: Unknown srq state = 0x%x\n", __func__,
-                               __LINE__, MLX5_GET(srqc, srqc, state));
-                       MLX5_SET(rmpc, rmpc, state, MLX5_GET(srqc, srqc, state));
-               }
-
-               MLX5_SET(wq,   wq, wq_signature,  MLX5_GET(srqc,  srqc, wq_signature));
-               MLX5_SET(wq,   wq, log_wq_pg_sz,  MLX5_GET(srqc,  srqc, log_page_size));
-               MLX5_SET(wq,   wq, log_wq_stride, MLX5_GET(srqc,  srqc, log_rq_stride) + 4);
-               MLX5_SET(wq,   wq, log_wq_sz,     MLX5_GET(srqc,  srqc, log_srq_size));
-               MLX5_SET(wq,   wq, page_offset,   MLX5_GET(srqc,  srqc, page_offset));
-               MLX5_SET(wq,   wq, lwm,           MLX5_GET(srqc,  srqc, lwm));
-               MLX5_SET(wq,   wq, pd,            MLX5_GET(srqc,  srqc, pd));
-               MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(srqc,     srqc, dbr_addr));
-       } else {
-               switch (MLX5_GET(rmpc, rmpc, state)) {
-               case MLX5_RMPC_STATE_RDY:
-                       MLX5_SET(srqc, srqc, state, MLX5_SRQC_STATE_GOOD);
-                       break;
-               case MLX5_RMPC_STATE_ERR:
-                       MLX5_SET(srqc, srqc, state, MLX5_SRQC_STATE_ERROR);
-                       break;
-               default:
-                       pr_warn("%s: %d: Unknown rmp state = 0x%x\n",
-                               __func__, __LINE__,
-                               MLX5_GET(rmpc, rmpc, state));
-                       MLX5_SET(srqc, srqc, state,
-                                MLX5_GET(rmpc, rmpc, state));
-               }
-
-               MLX5_SET(srqc,   srqc, wq_signature,   MLX5_GET(wq,   wq, wq_signature));
-               MLX5_SET(srqc,   srqc, log_page_size,  MLX5_GET(wq,   wq, log_wq_pg_sz));
-               MLX5_SET(srqc,   srqc, log_rq_stride,  MLX5_GET(wq,   wq, log_wq_stride) - 4);
-               MLX5_SET(srqc,   srqc, log_srq_size,   MLX5_GET(wq,   wq, log_wq_sz));
-               MLX5_SET(srqc,   srqc, page_offset,    MLX5_GET(wq,   wq, page_offset));
-               MLX5_SET(srqc,   srqc, lwm,            MLX5_GET(wq,   wq, lwm));
-               MLX5_SET(srqc,   srqc, pd,             MLX5_GET(wq,   wq, pd));
-               MLX5_SET64(srqc, srqc, dbr_addr,       MLX5_GET64(wq, wq, dbr_addr));
-       }
+       MLX5_SET(wq,   wq, wq_signature,  !!(in->flags
+                & MLX5_SRQ_FLAG_WQ_SIG));
+       MLX5_SET(wq,   wq, log_wq_pg_sz,  in->log_page_size);
+       MLX5_SET(wq,   wq, log_wq_stride, in->wqe_shift + 4);
+       MLX5_SET(wq,   wq, log_wq_sz,     in->log_size);
+       MLX5_SET(wq,   wq, page_offset,   in->page_offset);
+       MLX5_SET(wq,   wq, lwm,           in->lwm);
+       MLX5_SET(wq,   wq, pd,            in->pd);
+       MLX5_SET64(wq, wq, dbr_addr,      in->db_record);
+}
+
+/* Fill the SRQ context @srqc from the generic SRQ attributes in @in. */
+static void set_srqc(void *srqc, struct mlx5_srq_attr *in)
+{
+       /* Collapse the flag test to 0/1 before storing it in the context. */
+       u32 wq_sig = !!(in->flags & MLX5_SRQ_FLAG_WQ_SIG);
+
+       MLX5_SET(srqc, srqc, wq_signature, wq_sig);
+       MLX5_SET(srqc, srqc, log_page_size, in->log_page_size);
+       MLX5_SET(srqc, srqc, log_rq_stride, in->wqe_shift);
+       MLX5_SET(srqc, srqc, log_srq_size, in->log_size);
+       MLX5_SET(srqc, srqc, page_offset, in->page_offset);
+       MLX5_SET(srqc, srqc, lwm, in->lwm);
+       MLX5_SET(srqc, srqc, pd, in->pd);
+       MLX5_SET64(srqc, srqc, dbr_addr, in->db_record);
+       MLX5_SET(srqc, srqc, xrcd, in->xrcd);
+       MLX5_SET(srqc, srqc, cqn, in->cqn);
+}
+
+/*
+ * Translate the work queue context @wq into the generic SRQ attributes @in.
+ * MLX5_SRQ_FLAG_WQ_SIG is raised in @in->flags when the context has
+ * wq_signature set; all other flag bits are left as the caller passed them.
+ */
+static void get_wq(void *wq, struct mlx5_srq_attr *in)
+{
+       /*
+        * The flag must be OR-ed in: an AND could only clear bits and would
+        * never report the signature on a zero-initialised @in.
+        */
+       if (MLX5_GET(wq, wq, wq_signature))
+               in->flags |= MLX5_SRQ_FLAG_WQ_SIG;
+       in->log_page_size = MLX5_GET(wq,   wq, log_wq_pg_sz);
+       /* The WQ stride is stored 4 higher than the SRQ wqe_shift. */
+       in->wqe_shift     = MLX5_GET(wq,   wq, log_wq_stride) - 4;
+       in->log_size      = MLX5_GET(wq,   wq, log_wq_sz);
+       in->page_offset   = MLX5_GET(wq,   wq, page_offset);
+       in->lwm           = MLX5_GET(wq,   wq, lwm);
+       in->pd            = MLX5_GET(wq,   wq, pd);
+       in->db_record     = MLX5_GET64(wq, wq, dbr_addr);
+}
+
+/*
+ * Translate the SRQ context @srqc into the generic SRQ attributes @in.
+ * MLX5_SRQ_FLAG_WQ_SIG is raised in @in->flags when the context has
+ * wq_signature set; all other flag bits are left as the caller passed them.
+ */
+static void get_srqc(void *srqc, struct mlx5_srq_attr *in)
+{
+       /*
+        * The flag must be OR-ed in: an AND could only clear bits and would
+        * never report the signature on a zero-initialised @in.
+        */
+       if (MLX5_GET(srqc, srqc, wq_signature))
+               in->flags |= MLX5_SRQ_FLAG_WQ_SIG;
+       in->log_page_size = MLX5_GET(srqc,   srqc, log_page_size);
+       in->wqe_shift     = MLX5_GET(srqc,   srqc, log_rq_stride);
+       in->log_size      = MLX5_GET(srqc,   srqc, log_srq_size);
+       in->page_offset   = MLX5_GET(srqc,   srqc, page_offset);
+       in->lwm           = MLX5_GET(srqc,   srqc, lwm);
+       in->pd            = MLX5_GET(srqc,   srqc, pd);
+       in->db_record     = MLX5_GET64(srqc, srqc, dbr_addr);
  }
  
  struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn)
@@ -149,19 +150,36 @@ struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn)
  EXPORT_SYMBOL(mlx5_core_get_srq);
  
  static int create_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                         struct mlx5_create_srq_mbox_in *in, int inlen)
+                         struct mlx5_srq_attr *in)
  {
-       struct mlx5_create_srq_mbox_out out;
+       u32 create_out[MLX5_ST_SZ_DW(create_srq_out)] = {0};
+       void *create_in;
+       void *srqc;
+       void *pas;
+       int pas_size;
+       int inlen;
         int err;
  
-       memset(&out, 0, sizeof(out));
+       pas_size  = get_pas_size(in);
+       inlen     = MLX5_ST_SZ_BYTES(create_srq_in) + pas_size;
+       create_in = mlx5_vzalloc(inlen);
+       if (!create_in)
+               return -ENOMEM;
+
+       srqc = MLX5_ADDR_OF(create_srq_in, create_in, srq_context_entry);
+       pas = MLX5_ADDR_OF(create_srq_in, create_in, pas);
  
-       in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_SRQ);
+       set_srqc(srqc, in);
+       memcpy(pas, in->pas, pas_size);
  
-       err = mlx5_cmd_exec_check_status(dev, (u32 *)in, inlen, (u32 *)(&out),
-                                        sizeof(out));
+       MLX5_SET(create_srq_in, create_in, opcode,
+                MLX5_CMD_OP_CREATE_SRQ);
  
-       srq->srqn = be32_to_cpu(out.srqn) & 0xffffff;
+       err = mlx5_cmd_exec_check_status(dev, create_in, inlen, create_out,
+                                        sizeof(create_out));
+       kvfree(create_in);
+       if (!err)
+               srq->srqn = MLX5_GET(create_srq_out, create_out, srqn);
  
         return err;
  }
@@ -169,67 +187,75 @@ static int create_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
  static int destroy_srq_cmd(struct mlx5_core_dev *dev,
                            struct mlx5_core_srq *srq)
  {
-       struct mlx5_destroy_srq_mbox_in in;
-       struct mlx5_destroy_srq_mbox_out out;
+       u32 srq_in[MLX5_ST_SZ_DW(destroy_srq_in)] = {0};
+       u32 srq_out[MLX5_ST_SZ_DW(destroy_srq_out)] = {0};
  
-       memset(&in, 0, sizeof(in));
-       memset(&out, 0, sizeof(out));
-       in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_SRQ);
-       in.srqn = cpu_to_be32(srq->srqn);
+       MLX5_SET(destroy_srq_in, srq_in, opcode,
+                MLX5_CMD_OP_DESTROY_SRQ);
+       MLX5_SET(destroy_srq_in, srq_in, srqn, srq->srqn);
  
-       return mlx5_cmd_exec_check_status(dev, (u32 *)(&in), sizeof(in),
-                                         (u32 *)(&out), sizeof(out));
+       return mlx5_cmd_exec_check_status(dev, srq_in, sizeof(srq_in),
+                                         srq_out, sizeof(srq_out));
  }
  
  static int arm_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
                        u16 lwm, int is_srq)
  {
-       struct mlx5_arm_srq_mbox_in     in;
-       struct mlx5_arm_srq_mbox_out    out;
-
-       memset(&in, 0, sizeof(in));
-       memset(&out, 0, sizeof(out));
+       /* arm_srq structs are missing; use the identical xrc ones instead */
+       u32 srq_in[MLX5_ST_SZ_DW(arm_xrc_srq_in)] = {0};
+       u32 srq_out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0};
  
-       in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ARM_RQ);
-       in.hdr.opmod = cpu_to_be16(!!is_srq);
-       in.srqn = cpu_to_be32(srq->srqn);
-       in.lwm = cpu_to_be16(lwm);
+       MLX5_SET(arm_xrc_srq_in, srq_in, opcode,   MLX5_CMD_OP_ARM_XRC_SRQ);
+       MLX5_SET(arm_xrc_srq_in, srq_in, xrc_srqn, srq->srqn);
+       MLX5_SET(arm_xrc_srq_in, srq_in, lwm,      lwm);
  
-       return mlx5_cmd_exec_check_status(dev, (u32 *)(&in),
-                                         sizeof(in), (u32 *)(&out),
-                                         sizeof(out));
+       return  mlx5_cmd_exec_check_status(dev, srq_in, sizeof(srq_in),
+                                          srq_out, sizeof(srq_out));
  }
  
  static int query_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                        struct mlx5_query_srq_mbox_out *out)
+                        struct mlx5_srq_attr *out)
  {
-       struct mlx5_query_srq_mbox_in in;
+       u32 srq_in[MLX5_ST_SZ_DW(query_srq_in)] = {0};
+       u32 *srq_out;
+       void *srqc;
+       int err;
  
-       memset(&in, 0, sizeof(in));
+       srq_out = mlx5_vzalloc(MLX5_ST_SZ_BYTES(query_srq_out));
+       if (!srq_out)
+               return -ENOMEM;
  
-       in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SRQ);
-       in.srqn = cpu_to_be32(srq->srqn);
+       MLX5_SET(query_srq_in, srq_in, opcode,
+                MLX5_CMD_OP_QUERY_SRQ);
+       MLX5_SET(query_srq_in, srq_in, srqn, srq->srqn);
+       err =  mlx5_cmd_exec_check_status(dev, srq_in, sizeof(srq_in),
+                                         srq_out,
+                                         MLX5_ST_SZ_BYTES(query_srq_out));
+       if (err)
+               goto out;
  
-       return mlx5_cmd_exec_check_status(dev, (u32 *)(&in), sizeof(in),
-                                         (u32 *)out, sizeof(*out));
+       srqc = MLX5_ADDR_OF(query_srq_out, srq_out, srq_context_entry);
+       get_srqc(srqc, out);
+       if (MLX5_GET(srqc, srqc, state) != MLX5_SRQC_STATE_GOOD)
+               out->flags |= MLX5_SRQ_FLAG_ERR;
+out:
+       kvfree(srq_out);
+       return err;
  }
  
  static int create_xrc_srq_cmd(struct mlx5_core_dev *dev,
                               struct mlx5_core_srq *srq,
-                             struct mlx5_create_srq_mbox_in *in,
-                             int srq_inlen)
+                             struct mlx5_srq_attr *in)
  {
         u32 create_out[MLX5_ST_SZ_DW(create_xrc_srq_out)];
         void *create_in;
-       void *srqc;
         void *xrc_srqc;
         void *pas;
         int pas_size;
         int inlen;
         int err;
  
-       srqc      = MLX5_ADDR_OF(create_srq_in, in, srq_context_entry);
-       pas_size  = get_pas_size(srqc);
+       pas_size  = get_pas_size(in);
         inlen     = MLX5_ST_SZ_BYTES(create_xrc_srq_in) + pas_size;
         create_in = mlx5_vzalloc(inlen);
         if (!create_in)
@@ -239,7 +265,8 @@ static int create_xrc_srq_cmd(struct mlx5_core_dev *dev,
                                 xrc_srq_context_entry);
         pas      = MLX5_ADDR_OF(create_xrc_srq_in, create_in, pas);
  
-       memcpy(xrc_srqc, srqc, MLX5_ST_SZ_BYTES(srqc));
+       set_srqc(xrc_srqc, in);
+       MLX5_SET(xrc_srqc, xrc_srqc, user_index, in->user_index);
         memcpy(pas, in->pas, pas_size);
         MLX5_SET(create_xrc_srq_in, create_in, opcode,
                  MLX5_CMD_OP_CREATE_XRC_SRQ);
@@ -293,11 +320,10 @@ static int arm_xrc_srq_cmd(struct mlx5_core_dev *dev,
  
  static int query_xrc_srq_cmd(struct mlx5_core_dev *dev,
                              struct mlx5_core_srq *srq,
-                            struct mlx5_query_srq_mbox_out *out)
+                            struct mlx5_srq_attr *out)
  {
         u32 xrcsrq_in[MLX5_ST_SZ_DW(query_xrc_srq_in)];
         u32 *xrcsrq_out;
-       void *srqc;
         void *xrc_srqc;
         int err;
  
@@ -317,8 +343,9 @@ static int query_xrc_srq_cmd(struct mlx5_core_dev *dev,
  
         xrc_srqc = MLX5_ADDR_OF(query_xrc_srq_out, xrcsrq_out,
                                 xrc_srq_context_entry);
-       srqc = MLX5_ADDR_OF(query_srq_out, out, srq_context_entry);
-       memcpy(srqc, xrc_srqc, MLX5_ST_SZ_BYTES(srqc));
+       get_srqc(xrc_srqc, out);
+       if (MLX5_GET(xrc_srqc, xrc_srqc, state) != MLX5_XRC_SRQC_STATE_GOOD)
+               out->flags |= MLX5_SRQ_FLAG_ERR;
  
  out:
         kvfree(xrcsrq_out);
@@ -326,26 +353,27 @@ out:
  }
  
  static int create_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                         struct mlx5_create_srq_mbox_in *in, int srq_inlen)
+                         struct mlx5_srq_attr *in)
  {
         void *create_in;
         void *rmpc;
-       void *srqc;
+       void *wq;
         int pas_size;
         int inlen;
         int err;
  
-       srqc = MLX5_ADDR_OF(create_srq_in, in, srq_context_entry);
-       pas_size = get_pas_size(srqc);
+       pas_size = get_pas_size(in);
         inlen = MLX5_ST_SZ_BYTES(create_rmp_in) + pas_size;
         create_in = mlx5_vzalloc(inlen);
         if (!create_in)
                 return -ENOMEM;
  
         rmpc = MLX5_ADDR_OF(create_rmp_in, create_in, ctx);
+       wq = MLX5_ADDR_OF(rmpc, rmpc, wq);
  
+       MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
+       set_wq(wq, in);
         memcpy(MLX5_ADDR_OF(rmpc, rmpc, wq.pas), in->pas, pas_size);
-       rmpc_srqc_reformat(srqc, rmpc, true);
  
         err = mlx5_core_create_rmp(dev, create_in, inlen, &srq->srqn);
  
@@ -390,11 +418,10 @@ static int arm_rmp_cmd(struct mlx5_core_dev *dev,
  }
  
  static int query_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                        struct mlx5_query_srq_mbox_out *out)
+                        struct mlx5_srq_attr *out)
  {
         u32 *rmp_out;
         void *rmpc;
-       void *srqc;
         int err;
  
         rmp_out =  mlx5_vzalloc(MLX5_ST_SZ_BYTES(query_rmp_out));
@@ -405,9 +432,10 @@ static int query_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
         if (err)
                 goto out;
  
-       srqc = MLX5_ADDR_OF(query_srq_out, out,     srq_context_entry);
         rmpc = MLX5_ADDR_OF(query_rmp_out, rmp_out, rmp_context);
-       rmpc_srqc_reformat(srqc, rmpc, false);
+       get_wq(MLX5_ADDR_OF(rmpc, rmpc, wq), out);
+       if (MLX5_GET(rmpc, rmpc, state) != MLX5_RMPC_STATE_RDY)
+               out->flags |= MLX5_SRQ_FLAG_ERR;
  
  out:
         kvfree(rmp_out);
@@ -416,15 +444,14 @@ out:
  
  static int create_srq_split(struct mlx5_core_dev *dev,
                             struct mlx5_core_srq *srq,
-                           struct mlx5_create_srq_mbox_in *in,
-                           int inlen, int is_xrc)
+                           struct mlx5_srq_attr *in)
  {
         if (!dev->issi)
-               return create_srq_cmd(dev, srq, in, inlen);
+               return create_srq_cmd(dev, srq, in);
         else if (srq->common.res == MLX5_RES_XSRQ)
-               return create_xrc_srq_cmd(dev, srq, in, inlen);
+               return create_xrc_srq_cmd(dev, srq, in);
         else
-               return create_rmp_cmd(dev, srq, in, inlen);
+               return create_rmp_cmd(dev, srq, in);
  }
  
  static int destroy_srq_split(struct mlx5_core_dev *dev,
@@ -439,15 +466,17 @@ static int destroy_srq_split(struct mlx5_core_dev *dev,
  }
  
  int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                        struct mlx5_create_srq_mbox_in *in, int inlen,
-                        int is_xrc)
+                        struct mlx5_srq_attr *in)
  {
         int err;
         struct mlx5_srq_table *table = &dev->priv.srq_table;
  
-       srq->common.res = is_xrc ? MLX5_RES_XSRQ : MLX5_RES_SRQ;
+       if (in->type == IB_SRQT_XRC)
+               srq->common.res = MLX5_RES_XSRQ;
+       else
+               srq->common.res = MLX5_RES_SRQ;
  
-       err = create_srq_split(dev, srq, in, inlen, is_xrc);
+       err = create_srq_split(dev, srq, in);
         if (err)
                 return err;
  
@@ -502,7 +531,7 @@ int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq)
  EXPORT_SYMBOL(mlx5_core_destroy_srq);
  
  int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                       struct mlx5_query_srq_mbox_out *out)
+                       struct mlx5_srq_attr *out)
  {
         if (!dev->issi)
                 return query_srq_cmd(dev, srq, out);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c

index 03a5093ffeb72ab6f5c490d1eff42967ceb7f0f4..28274a6fbafe62e45baf8639d05c216fe2d8e61b 100644 (file)
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -85,6 +85,7 @@ int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn)
  
         return err;
  }
+EXPORT_SYMBOL(mlx5_core_create_rq);
  
  int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen)
  {
@@ -110,6 +111,7 @@ void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn)
  
         mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
  }
+EXPORT_SYMBOL(mlx5_core_destroy_rq);
  
  int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out)
  {
@@ -430,6 +432,7 @@ int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen,
  
         return err;
  }
+EXPORT_SYMBOL(mlx5_core_create_rqt);
  
  int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in,
                          int inlen)
@@ -455,3 +458,4 @@ void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn)
  
         mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
  }
+EXPORT_SYMBOL(mlx5_core_destroy_rqt);
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c

index fa49f9143b80631108e10ef78e5e45e285c40cf8..a46b585fae316f2f987080d7bc8dec976089af6b 100644 (file)
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -675,6 +675,9 @@ static void acpiphp_check_bridge(struct acpiphp_bridge *bridge)
         if (bridge->is_going_away)
                 return;
  
+       if (bridge->pci_dev)
+               pm_runtime_get_sync(&bridge->pci_dev->dev);
+
         list_for_each_entry(slot, &bridge->slots, node) {
                 struct pci_bus *bus = slot->bus;
                 struct pci_dev *dev, *tmp;
@@ -694,6 +697,9 @@ static void acpiphp_check_bridge(struct acpiphp_bridge *bridge)
                         disable_slot(slot);
                 }
         }
+
+       if (bridge->pci_dev)
+               pm_runtime_put(&bridge->pci_dev->dev);
  }
  
  /*
diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c

index bedb361746a03d99cadcc3fcc620acd5f6907408..c38a5b9733c8f11b049edaf25a66445433e497f4 100644 (file)
--- a/drivers/pnp/pnpbios/core.c
+++ b/drivers/pnp/pnpbios/core.c
@@ -60,6 +60,7 @@
  #include <linux/delay.h>
  #include <linux/acpi.h>
  #include <linux/freezer.h>
+#include <linux/kmod.h>
  #include <linux/kthread.h>
  
  #include <asm/page.h>
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig

index 18639e0cb6e2b5a5f32548dd1dc7652df19d875b..e215f50794b68f857ba4d9cfb86755727735dec1 100644 (file)
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -5,6 +5,10 @@
  config RTC_LIB
         bool
  
+config RTC_MC146818_LIB
+       bool
+       select RTC_LIB
+
  menuconfig RTC_CLASS
         bool "Real Time Clock"
         default n
@@ -574,10 +578,10 @@ config RTC_DRV_EM3027
           will be called rtc-em3027.
  
  config RTC_DRV_RV8803
-       tristate "Micro Crystal RV8803"
+       tristate "Micro Crystal RV8803, Epson RX8900"
         help
-         If you say yes here you get support for the Micro Crystal
-         RV8803 RTC chips.
+         If you say yes here you get support for the Micro Crystal RV8803 and
+         Epson RX8900 RTC chips.
  
           This driver can also be built as a module. If so, the module
           will be called rtc-rv8803.
@@ -670,6 +674,18 @@ config RTC_DRV_DS1390
           This driver can also be built as a module. If so, the module
           will be called rtc-ds1390.
  
+config RTC_DRV_MAX6916
+       tristate "Maxim MAX6916"
+       help
+         If you say yes here you will get support for the
+         Maxim MAX6916 SPI RTC chip.
+
+         This driver only supports the RTC feature, and not other chip
+         features such as alarms.
+
+         This driver can also be built as a module. If so, the module
+         will be called rtc-max6916.
+
  config RTC_DRV_R9701
         tristate "Epson RTC-9701JE"
         help
@@ -795,8 +811,9 @@ comment "Platform RTC drivers"
  
  config RTC_DRV_CMOS
         tristate "PC-style 'CMOS'"
-       depends on X86 || ARM || M32R || PPC || MIPS || SPARC64
+       depends on X86 || ARM || M32R || PPC || MIPS || SPARC64 || MN10300
         default y if X86
+       select RTC_MC146818_LIB
         help
           Say "yes" here to get direct support for the real time clock
           found in every PC or ACPI-based system, and some other boards.
@@ -815,6 +832,7 @@ config RTC_DRV_CMOS
  config RTC_DRV_ALPHA
         bool "Alpha PC-style CMOS"
         depends on ALPHA
+       select RTC_MC146818_LIB
         default y
         help
           Direct support for the real-time clock found on every Alpha
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile

index ea2833723fa9b561c840757727cc98b4b8ab869d..7cf7ad559c79b82a0b1fc08f2f3e1fefe47795d3 100644 (file)
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_RTC_LIB)           += rtc-lib.o
  obj-$(CONFIG_RTC_HCTOSYS)      += hctosys.o
  obj-$(CONFIG_RTC_SYSTOHC)      += systohc.o
  obj-$(CONFIG_RTC_CLASS)                += rtc-core.o
+obj-$(CONFIG_RTC_MC146818_LIB) += rtc-mc146818-lib.o
  rtc-core-y                     := class.o interface.o
  
  ifdef CONFIG_RTC_DRV_EFI
@@ -85,6 +86,7 @@ obj-$(CONFIG_RTC_DRV_M48T59)  += rtc-m48t59.o
  obj-$(CONFIG_RTC_DRV_M48T86)   += rtc-m48t86.o
  obj-$(CONFIG_RTC_DRV_MAX6900)  += rtc-max6900.o
  obj-$(CONFIG_RTC_DRV_MAX6902)  += rtc-max6902.o
+obj-$(CONFIG_RTC_DRV_MAX6916)  += rtc-max6916.o
  obj-$(CONFIG_RTC_DRV_MAX77686) += rtc-max77686.o
  obj-$(CONFIG_RTC_DRV_MAX8907)  += rtc-max8907.o
  obj-$(CONFIG_RTC_DRV_MAX8925)  += rtc-max8925.o
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c

index 9ef5f6f89f98af8d97da7595aa4ce5a90c6d2566..84a52db9b05f905bae3c294b88d116e77327f8c5 100644 (file)
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -104,7 +104,17 @@ static int rtc_read_alarm_internal(struct rtc_device *rtc, struct rtc_wkalrm *al
         else if (!rtc->ops->read_alarm)
                 err = -EINVAL;
         else {
-               memset(alarm, 0, sizeof(struct rtc_wkalrm));
+               alarm->enabled = 0;
+               alarm->pending = 0;
+               alarm->time.tm_sec = -1;
+               alarm->time.tm_min = -1;
+               alarm->time.tm_hour = -1;
+               alarm->time.tm_mday = -1;
+               alarm->time.tm_mon = -1;
+               alarm->time.tm_year = -1;
+               alarm->time.tm_wday = -1;
+               alarm->time.tm_yday = -1;
+               alarm->time.tm_isdst = -1;
                 err = rtc->ops->read_alarm(rtc->dev.parent, alarm);
         }
  
@@ -383,7 +393,7 @@ int rtc_initialize_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
         rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time);
         rtc->aie_timer.period = ktime_set(0, 0);
  
-       /* Alarm has to be enabled & in the futrure for us to enqueue it */
+       /* Alarm has to be enabled & in the future for us to enqueue it */
         if (alarm->enabled && (rtc_tm_to_ktime(now).tv64 <
                          rtc->aie_timer.node.expires.tv64)) {
  
@@ -395,8 +405,6 @@ int rtc_initialize_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
  }
  EXPORT_SYMBOL_GPL(rtc_initialize_alarm);
  
-
-
  int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled)
  {
         int err = mutex_lock_interruptible(&rtc->ops_lock);
@@ -748,9 +756,23 @@ EXPORT_SYMBOL_GPL(rtc_irq_set_freq);
   */
  static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer)
  {
+       struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue);
+       struct rtc_time tm;
+       ktime_t now;
+
         timer->enabled = 1;
+       __rtc_read_time(rtc, &tm);
+       now = rtc_tm_to_ktime(tm);
+
+       /* Skip over expired timers */
+       while (next) {
+               if (next->expires.tv64 >= now.tv64)
+                       break;
+               next = timerqueue_iterate_next(next);
+       }
+
         timerqueue_add(&rtc->timerqueue, &timer->node);
-       if (&timer->node == timerqueue_getnext(&rtc->timerqueue)) {
+       if (!next) {
                 struct rtc_wkalrm alarm;
                 int err;
                 alarm.time = rtc_ktime_to_tm(timer->node.expires);
diff --git a/drivers/rtc/rtc-abx80x.c b/drivers/rtc/rtc-abx80x.c

index ba0d61934d35aefbc3fca7ded4194c8c3a49862a..fea9a60b06cf6dd25b1a078bf6e812265aa0acc1 100644 (file)
--- a/drivers/rtc/rtc-abx80x.c
+++ b/drivers/rtc/rtc-abx80x.c
@@ -643,17 +643,15 @@ static int abx80x_probe(struct i2c_client *client,
                 return err;
         }
  
-       err = devm_add_action(&client->dev, rtc_calib_remove_sysfs_group,
-                             &client->dev);
-       if (err) {
-               rtc_calib_remove_sysfs_group(&client->dev);
+       err = devm_add_action_or_reset(&client->dev,
+                                      rtc_calib_remove_sysfs_group,
+                                      &client->dev);
+       if (err)
                 dev_err(&client->dev,
                         "Failed to add sysfs cleanup action: %d\n",
                         err);
-               return err;
-       }
  
-       return 0;
+       return err;
  }
  
  static int abx80x_remove(struct i2c_client *client)
diff --git a/drivers/rtc/rtc-asm9260.c b/drivers/rtc/rtc-asm9260.c

index 355fdb97a00638e29cafb1209f267083b1ece573..5219916ce11d2b609bde574a4b99f72d279eb8e9 100644 (file)
--- a/drivers/rtc/rtc-asm9260.c
+++ b/drivers/rtc/rtc-asm9260.c
@@ -343,7 +343,6 @@ static struct platform_driver asm9260_rtc_driver = {
         .remove         = asm9260_rtc_remove,
         .driver         = {
                 .name   = "asm9260-rtc",
-               .owner  = THIS_MODULE,
                 .of_match_table = asm9260_dt_ids,
         },
  };
diff --git a/drivers/rtc/rtc-at91sam9.c b/drivers/rtc/rtc-at91sam9.c

index 99732e6f8c3b295a9647c5d61caf4779b31d6adf..7418a763ce5202b37f96b0cf26863400aa953982 100644 (file)
--- a/drivers/rtc/rtc-at91sam9.c
+++ b/drivers/rtc/rtc-at91sam9.c
@@ -375,6 +375,7 @@ static int at91_rtc_probe(struct platform_device *pdev)
         if (!rtc)
                 return -ENOMEM;
  
+       spin_lock_init(&rtc->lock);
         rtc->irq = irq;
  
         /* platform setup code should have handled this; sigh */
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c

index fbe9c72438e19966ab7a94ae13c5221811fbe25b..43745cac0141a4445dda3abe6a31d8ca172355b0 100644 (file)
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -43,7 +43,7 @@
  #include <linux/of_platform.h>
  
  /* this is for "generic access to PC-style RTC" using CMOS_READ/CMOS_WRITE */
-#include <asm-generic/rtc.h>
+#include <linux/mc146818rtc.h>
  
  struct cmos_rtc {
         struct rtc_device       *rtc;
@@ -190,10 +190,10 @@ static inline void cmos_write_bank2(unsigned char val, unsigned char addr)
  static int cmos_read_time(struct device *dev, struct rtc_time *t)
  {
         /* REVISIT:  if the clock has a "century" register, use
-        * that instead of the heuristic in get_rtc_time().
+        * that instead of the heuristic in mc146818_get_time().
          * That'll make Y3K compatility (year > 2070) easy!
          */
-       get_rtc_time(t);
+       mc146818_get_time(t);
         return 0;
  }
  
@@ -205,7 +205,7 @@ static int cmos_set_time(struct device *dev, struct rtc_time *t)
          * takes effect exactly 500ms after we write the register.
          * (Also queueing and other delays before we get this far.)
          */
-       return set_rtc_time(t);
+       return mc146818_set_time(t);
  }
  
  static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t)
@@ -220,8 +220,6 @@ static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t)
          * Some also support day and month, for alarms up to a year in
          * the future.
          */
-       t->time.tm_mday = -1;
-       t->time.tm_mon = -1;
  
         spin_lock_irq(&rtc_lock);
         t->time.tm_sec = CMOS_READ(RTC_SECONDS_ALARM);
@@ -272,7 +270,6 @@ static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t)
                         }
                 }
         }
-       t->time.tm_year = -1;
  
         t->enabled = !!(rtc_control & RTC_AIE);
         t->pending = 0;
@@ -630,7 +627,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
         address_space = 64;
  #elif defined(__i386__) || defined(__x86_64__) || defined(__arm__) \
                         || defined(__sparc__) || defined(__mips__) \
-                       || defined(__powerpc__)
+                       || defined(__powerpc__) || defined(CONFIG_MN10300)
         address_space = 128;
  #else
  #warning Assuming 128 bytes of RTC+NVRAM address space, not 64 bytes.
@@ -1142,14 +1139,14 @@ static __init void cmos_of_init(struct platform_device *pdev)
         if (val)
                 CMOS_WRITE(be32_to_cpup(val), RTC_FREQ_SELECT);
  
-       get_rtc_time(&time);
+       cmos_read_time(&pdev->dev, &time);
         ret = rtc_valid_tm(&time);
         if (ret) {
                 struct rtc_time def_time = {
                         .tm_year = 1,
                         .tm_mday = 1,
                 };
-               set_rtc_time(&def_time);
+               cmos_set_time(&pdev->dev, &def_time);
         }
  }
  #else
diff --git a/drivers/rtc/rtc-da9052.c b/drivers/rtc/rtc-da9052.c

index a20bcf0e33cd10a1c0f43918c759f4d52856a07a..4273377562ec64dcdf640506fe4b4dd83b413de0 100644 (file)
--- a/drivers/rtc/rtc-da9052.c
+++ b/drivers/rtc/rtc-da9052.c
@@ -85,6 +85,7 @@ static int da9052_read_alarm(struct da9052_rtc *rtc, struct rtc_time *rtc_tm)
                         rtc_tm->tm_mday = v[0][2] & DA9052_RTC_DAY;
                         rtc_tm->tm_hour = v[0][1] & DA9052_RTC_HOUR;
                         rtc_tm->tm_min  = v[0][0] & DA9052_RTC_MIN;
+                       rtc_tm->tm_sec = 0;
  
                         ret = rtc_valid_tm(rtc_tm);
                         return ret;
diff --git a/drivers/rtc/rtc-da9055.c b/drivers/rtc/rtc-da9055.c

index 7ec0872d5e3b1bd3a585abb9e7a8aaad3aeafa2f..678af8648c45316f22ec7e4d124d1758436022d6 100644 (file)
--- a/drivers/rtc/rtc-da9055.c
+++ b/drivers/rtc/rtc-da9055.c
@@ -74,6 +74,7 @@ static int da9055_read_alarm(struct da9055 *da9055, struct rtc_time *rtc_tm)
         rtc_tm->tm_mday = v[2] & DA9055_RTC_ALM_DAY;
         rtc_tm->tm_hour = v[1] & DA9055_RTC_ALM_HOUR;
         rtc_tm->tm_min  = v[0] & DA9055_RTC_ALM_MIN;
+       rtc_tm->tm_sec = 0;
  
         return rtc_valid_tm(rtc_tm);
  }
diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c

index c5432bf64e1c317324f009353ca7961e97e8fb63..dba60c1dfce2ee9f71fefc6b023a8eee799e63d2 100644 (file)
--- a/drivers/rtc/rtc-davinci.c
+++ b/drivers/rtc/rtc-davinci.c
@@ -388,6 +388,8 @@ static int davinci_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
         u8 day0, day1;
         unsigned long flags;
  
+       alm->time.tm_sec = 0;
+
         spin_lock_irqsave(&davinci_rtc_lock, flags);
  
         davinci_rtcss_calendar_wait(davinci_rtc);
diff --git a/drivers/rtc/rtc-ds1286.c b/drivers/rtc/rtc-ds1286.c

index 756e509f6ed267539c4b26bb415238bc754098a1..ef75c349dff9cf3da5a5c2c692dc4e61b3fe79bf 100644 (file)
--- a/drivers/rtc/rtc-ds1286.c
+++ b/drivers/rtc/rtc-ds1286.c
@@ -16,7 +16,7 @@
  #include <linux/rtc.h>
  #include <linux/platform_device.h>
  #include <linux/bcd.h>
-#include <linux/ds1286.h>
+#include <linux/rtc/ds1286.h>
  #include <linux/io.h>
  #include <linux/slab.h>
  
diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c

index 8e41c4613e5135cbd4db573586f85ef60678447e..72b22935eb62a0e9d71da4552ab64d0ace6ccd39 100644 (file)
--- a/drivers/rtc/rtc-ds1305.c
+++ b/drivers/rtc/rtc-ds1305.c
@@ -313,13 +313,6 @@ static int ds1305_get_alarm(struct device *dev, struct rtc_wkalrm *alm)
         alm->time.tm_sec = bcd2bin(buf[DS1305_SEC]);
         alm->time.tm_min = bcd2bin(buf[DS1305_MIN]);
         alm->time.tm_hour = bcd2hour(buf[DS1305_HOUR]);
-       alm->time.tm_mday = -1;
-       alm->time.tm_mon = -1;
-       alm->time.tm_year = -1;
-       /* next three fields are unused by Linux */
-       alm->time.tm_wday = -1;
-       alm->time.tm_mday = -1;
-       alm->time.tm_isdst = -1;
  
         return 0;
  }
diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c

index 821d9c089cdb48a40a244d1c18ab912d6f0b9432..8e1c5cb6ece6f60619e5cafcea4271e056d96778 100644 (file)
--- a/drivers/rtc/rtc-ds1307.c
+++ b/drivers/rtc/rtc-ds1307.c
@@ -482,11 +482,6 @@ static int ds1337_read_alarm(struct device *dev, struct rtc_wkalrm *t)
         t->time.tm_min = bcd2bin(ds1307->regs[1] & 0x7f);
         t->time.tm_hour = bcd2bin(ds1307->regs[2] & 0x3f);
         t->time.tm_mday = bcd2bin(ds1307->regs[3] & 0x3f);
-       t->time.tm_mon = -1;
-       t->time.tm_year = -1;
-       t->time.tm_wday = -1;
-       t->time.tm_yday = -1;
-       t->time.tm_isdst = -1;
  
         /* ... and status */
         t->enabled = !!(ds1307->regs[7] & DS1337_BIT_A1IE);
@@ -602,6 +597,8 @@ static const struct rtc_class_ops ds13xx_rtc_ops = {
   * Alarm support for mcp794xx devices.
   */
  
+#define MCP794XX_REG_WEEKDAY           0x3
+#define MCP794XX_REG_WEEKDAY_WDAY_MASK 0x7
  #define MCP794XX_REG_CONTROL           0x07
  #      define MCP794XX_BIT_ALM0_EN     0x10
  #      define MCP794XX_BIT_ALM1_EN     0x20
@@ -1231,13 +1228,16 @@ static int ds1307_probe(struct i2c_client *client,
  {
         struct ds1307           *ds1307;
         int                     err = -ENODEV;
-       int                     tmp;
+       int                     tmp, wday;
         struct chip_desc        *chip = &chips[id->driver_data];
         struct i2c_adapter      *adapter = to_i2c_adapter(client->dev.parent);
         bool                    want_irq = false;
         bool                    ds1307_can_wakeup_device = false;
         unsigned char           *buf;
         struct ds1307_platform_data *pdata = dev_get_platdata(&client->dev);
+       struct rtc_time         tm;
+       unsigned long           timestamp;
+
         irq_handler_t   irq_handler = ds1307_irq;
  
         static const int        bbsqi_bitpos[] = {
@@ -1526,6 +1526,27 @@ read_rtc:
                                 bin2bcd(tmp));
         }
  
+       /*
+        * Some IPs have weekday reset value = 0x1 which might not be correct,
+        * hence compute the wday using the current date/month/year values
+        */
+       ds1307_get_time(&client->dev, &tm);
+       wday = tm.tm_wday;
+       timestamp = rtc_tm_to_time64(&tm);
+       rtc_time64_to_tm(timestamp, &tm);
+
+       /*
+        * Check if reset wday is different from the computed wday
+        * If different then set the wday which we computed using
+        * timestamp
+        */
+       if (wday != tm.tm_wday) {
+               wday = i2c_smbus_read_byte_data(client, MCP794XX_REG_WEEKDAY);
+               wday = wday & ~MCP794XX_REG_WEEKDAY_WDAY_MASK;
+               wday = wday | (tm.tm_wday + 1);
+               i2c_smbus_write_byte_data(client, MCP794XX_REG_WEEKDAY, wday);
+       }
+
         if (want_irq) {
                 device_set_wakeup_capable(&client->dev, true);
                 set_bit(HAS_ALARM, &ds1307->flags);
diff --git a/drivers/rtc/rtc-ds1343.c b/drivers/rtc/rtc-ds1343.c

index 23fa9f0cb5e3113f398a468d7c3a941f0bf45a98..895fbeeb47fe1bc78fc79350e6e58b23c50ab64e 100644 (file)
--- a/drivers/rtc/rtc-ds1343.c
+++ b/drivers/rtc/rtc-ds1343.c
@@ -504,12 +504,6 @@ static int ds1343_read_alarm(struct device *dev, struct rtc_wkalrm *alarm)
         alarm->time.tm_hour = priv->alarm_hour < 0 ? 0 : priv->alarm_hour;
         alarm->time.tm_mday = priv->alarm_mday < 0 ? 0 : priv->alarm_mday;
  
-       alarm->time.tm_mon = -1;
-       alarm->time.tm_year = -1;
-       alarm->time.tm_wday = -1;
-       alarm->time.tm_yday = -1;
-       alarm->time.tm_isdst = -1;
-
  out:
         mutex_unlock(&priv->mutex);
         return res;
diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c

index b3ce3c652fcd0a9211690533ed9a940978fa67f8..ed43b431166064770c7486498f75a55a10fe5689 100644 (file)
--- a/drivers/rtc/rtc-ds1685.c
+++ b/drivers/rtc/rtc-ds1685.c
@@ -102,6 +102,26 @@ ds1685_rtc_bin2bcd(struct ds1685_priv *rtc, u8 val, u8 bin_mask, u8 bcd_mask)
         return (val & bin_mask);
  }
  
+/**
+ * ds1685_rtc_check_mday - check validity of the day of month.
+ * @rtc: pointer to the ds1685 rtc structure.
+ * @mday: day of month.
+ *
+ * Returns -EDOM if the day of month is not within 1..31 range.
+ */
+static inline int
+ds1685_rtc_check_mday(struct ds1685_priv *rtc, u8 mday)
+{
+       if (rtc->bcd_mode) {
+               if (mday < 0x01 || mday > 0x31 || (mday & 0x0f) > 0x09)
+                       return -EDOM;
+       } else {
+               if (mday < 1 || mday > 31)
+                       return -EDOM;
+       }
+       return 0;
+}
+
  /**
   * ds1685_rtc_switch_to_bank0 - switch the rtc to bank 0.
   * @rtc: pointer to the ds1685 rtc structure.
@@ -377,6 +397,7 @@ ds1685_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
         struct platform_device *pdev = to_platform_device(dev);
         struct ds1685_priv *rtc = platform_get_drvdata(pdev);
         u8 seconds, minutes, hours, mday, ctrlb, ctrlc;
+       int ret;
  
         /* Fetch the alarm info from the RTC alarm registers. */
         ds1685_rtc_begin_data_access(rtc);
@@ -388,34 +409,29 @@ ds1685_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
         ctrlc   = rtc->read(rtc, RTC_CTRL_C);
         ds1685_rtc_end_data_access(rtc);
  
-       /* Check month date. */
-       if (!(mday >= 1) && (mday <= 31))
-               return -EDOM;
+       /* Check the month date for validity. */
+       ret = ds1685_rtc_check_mday(rtc, mday);
+       if (ret)
+               return ret;
  
         /*
          * Check the three alarm bytes.
          *
          * The Linux RTC system doesn't support the "don't care" capability
          * of this RTC chip.  We check for it anyways in case support is
-        * added in the future.
+        * added in the future and only assign when we care.
          */
-       if (unlikely(seconds >= 0xc0))
-               alrm->time.tm_sec = -1;
-       else
+       if (likely(seconds < 0xc0))
                 alrm->time.tm_sec = ds1685_rtc_bcd2bin(rtc, seconds,
                                                        RTC_SECS_BCD_MASK,
                                                        RTC_SECS_BIN_MASK);
  
-       if (unlikely(minutes >= 0xc0))
-               alrm->time.tm_min = -1;
-       else
+       if (likely(minutes < 0xc0))
                 alrm->time.tm_min = ds1685_rtc_bcd2bin(rtc, minutes,
                                                        RTC_MINS_BCD_MASK,
                                                        RTC_MINS_BIN_MASK);
  
-       if (unlikely(hours >= 0xc0))
-               alrm->time.tm_hour = -1;
-       else
+       if (likely(hours < 0xc0))
                 alrm->time.tm_hour = ds1685_rtc_bcd2bin(rtc, hours,
                                                         RTC_HRS_24_BCD_MASK,
                                                         RTC_HRS_24_BIN_MASK);
@@ -423,11 +439,6 @@ ds1685_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
         /* Write the data to rtc_wkalrm. */
         alrm->time.tm_mday = ds1685_rtc_bcd2bin(rtc, mday, RTC_MDAY_BCD_MASK,
                                                 RTC_MDAY_BIN_MASK);
-       alrm->time.tm_mon = -1;
-       alrm->time.tm_year = -1;
-       alrm->time.tm_wday = -1;
-       alrm->time.tm_yday = -1;
-       alrm->time.tm_isdst = -1;
         alrm->enabled = !!(ctrlb & RTC_CTRL_B_AIE);
         alrm->pending = !!(ctrlc & RTC_CTRL_C_AF);
  
@@ -445,6 +456,7 @@ ds1685_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
         struct platform_device *pdev = to_platform_device(dev);
         struct ds1685_priv *rtc = platform_get_drvdata(pdev);
         u8 ctrlb, seconds, minutes, hours, mday;
+       int ret;
  
         /* Fetch the alarm info and convert to BCD. */
         seconds = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_sec,
@@ -461,8 +473,9 @@ ds1685_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
                                      RTC_MDAY_BCD_MASK);
  
         /* Check the month date for validity. */
-       if (!(mday >= 1) && (mday <= 31))
-               return -EDOM;
+       ret = ds1685_rtc_check_mday(rtc, mday);
+       if (ret)
+               return ret;
  
         /*
          * Check the three alarm bytes.
diff --git a/drivers/rtc/rtc-ds2404.c b/drivers/rtc/rtc-ds2404.c

index 16310fe79d76f83b16fffde982395087dd405b70..9a1582ed7070a0ffc9d75d86f353d8383b7594ab 100644 (file)
--- a/drivers/rtc/rtc-ds2404.c
+++ b/drivers/rtc/rtc-ds2404.c
@@ -13,7 +13,7 @@
  #include <linux/rtc.h>
  #include <linux/types.h>
  #include <linux/bcd.h>
-#include <linux/rtc-ds2404.h>
+#include <linux/platform_data/rtc-ds2404.h>
  #include <linux/delay.h>
  #include <linux/gpio.h>
  #include <linux/slab.h>
diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c

index 04fbd7fffd0d8f2af296b35ac3dcb049c5cd8002..b1f20d8c358fd5c2a223563c46e302d221e575e9 100644 (file)
--- a/drivers/rtc/rtc-ds3232.c
+++ b/drivers/rtc/rtc-ds3232.c
@@ -197,12 +197,6 @@ static int ds3232_read_alarm(struct device *dev, struct rtc_wkalrm *alarm)
         alarm->time.tm_hour = bcd2bin(buf[2] & 0x7F);
         alarm->time.tm_mday = bcd2bin(buf[3] & 0x7F);
  
-       alarm->time.tm_mon = -1;
-       alarm->time.tm_year = -1;
-       alarm->time.tm_wday = -1;
-       alarm->time.tm_yday = -1;
-       alarm->time.tm_isdst = -1;
-
         alarm->enabled = !!(control & DS3232_REG_CR_A1IE);
         alarm->pending = !!(stat & DS3232_REG_SR_A1F);
  
diff --git a/drivers/rtc/rtc-efi.c b/drivers/rtc/rtc-efi.c

index 96d38609d803a90bd4beeea4395949a273c12e9b..0130afd7fe889e5767660292c2d2f87d58ad2725 100644 (file)
--- a/drivers/rtc/rtc-efi.c
+++ b/drivers/rtc/rtc-efi.c
@@ -259,6 +259,12 @@ static const struct rtc_class_ops efi_rtc_ops = {
  static int __init efi_rtc_probe(struct platform_device *dev)
  {
         struct rtc_device *rtc;
+       efi_time_t eft;
+       efi_time_cap_t cap;
+
+       /* First check if the RTC is usable */
+       if (efi.get_time(&eft, &cap) != EFI_SUCCESS)
+               return -ENODEV;
  
         rtc = devm_rtc_device_register(&dev->dev, "rtc-efi", &efi_rtc_ops,
                                         THIS_MODULE);
diff --git a/drivers/rtc/rtc-generic.c b/drivers/rtc/rtc-generic.c

index d726c6aa96a8fab1f69859cc364d969522d93be0..1bf5d23479285f1f29dba5fc43efa9bdba40f6e4 100644 (file)
--- a/drivers/rtc/rtc-generic.c
+++ b/drivers/rtc/rtc-generic.c
@@ -9,44 +9,10 @@
  #include <linux/platform_device.h>
  #include <linux/rtc.h>
  
-#if defined(CONFIG_M68K) || defined(CONFIG_PARISC) || \
-    defined(CONFIG_PPC) || defined(CONFIG_SUPERH32)
-#include <asm/rtc.h>
-
-static int generic_get_time(struct device *dev, struct rtc_time *tm)
-{
-       unsigned int ret = get_rtc_time(tm);
-
-       if (ret & RTC_BATT_BAD)
-               return -EOPNOTSUPP;
-
-       return rtc_valid_tm(tm);
-}
-
-static int generic_set_time(struct device *dev, struct rtc_time *tm)
-{
-       if (set_rtc_time(tm) < 0)
-               return -EOPNOTSUPP;
-
-       return 0;
-}
-
-static const struct rtc_class_ops generic_rtc_ops = {
-       .read_time = generic_get_time,
-       .set_time = generic_set_time,
-};
-#else
-#define generic_rtc_ops *(struct rtc_class_ops*)NULL
-#endif
-
  static int __init generic_rtc_probe(struct platform_device *dev)
  {
         struct rtc_device *rtc;
-       const struct rtc_class_ops *ops;
-
-       ops = dev_get_platdata(&dev->dev);
-       if (!ops)
-               ops = &generic_rtc_ops;
+       const struct rtc_class_ops *ops = dev_get_platdata(&dev->dev);
  
         rtc = devm_rtc_device_register(&dev->dev, "rtc-generic",
                                         ops, THIS_MODULE);
diff --git a/drivers/rtc/rtc-hym8563.c b/drivers/rtc/rtc-hym8563.c

index 207270376b552f6013f8cd86022727a0fc07058c..e5ad527cb75e369945a7a533105d6d0d8573ac10 100644 (file)
--- a/drivers/rtc/rtc-hym8563.c
+++ b/drivers/rtc/rtc-hym8563.c
@@ -198,7 +198,7 @@ static int hym8563_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
                 return ret;
  
         /* The alarm only has a minute accuracy */
-       alm_tm->tm_sec = -1;
+       alm_tm->tm_sec = 0;
  
         alm_tm->tm_min = (buf[0] & HYM8563_ALM_BIT_DISABLE) ?
                                         -1 :
@@ -213,9 +213,6 @@ static int hym8563_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
                                         -1 :
                                         bcd2bin(buf[3] & HYM8563_WEEKDAY_MASK);
  
-       alm_tm->tm_mon = -1;
-       alm_tm->tm_year = -1;
-
         ret = i2c_smbus_read_byte_data(client, HYM8563_CTL2);
         if (ret < 0)
                 return ret;
diff --git a/drivers/rtc/rtc-isl12057.c b/drivers/rtc/rtc-isl12057.c

index 54328d4ac0d31061a4afd76d5651279ef526449b..0e7f0f52bfe4e77a89600255997fb0d2aa7ff196 100644 (file)
--- a/drivers/rtc/rtc-isl12057.c
+++ b/drivers/rtc/rtc-isl12057.c
@@ -245,8 +245,7 @@ static int isl12057_rtc_update_alarm(struct device *dev, int enable)
  static int isl12057_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm)
  {
         struct isl12057_rtc_data *data = dev_get_drvdata(dev);
-       struct rtc_time rtc_tm, *alarm_tm = &alarm->time;
-       unsigned long rtc_secs, alarm_secs;
+       struct rtc_time *alarm_tm = &alarm->time;
         u8 regs[ISL12057_A1_SEC_LEN];
         unsigned int ir;
         int ret;
@@ -264,36 +263,6 @@ static int isl12057_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm)
         alarm_tm->tm_min  = bcd2bin(regs[1] & 0x7f);
         alarm_tm->tm_hour = bcd2bin(regs[2] & 0x3f);
         alarm_tm->tm_mday = bcd2bin(regs[3] & 0x3f);
-       alarm_tm->tm_wday = -1;
-
-       /*
-        * The alarm section does not store year/month. We use the ones in rtc
-        * section as a basis and increment month and then year if needed to get
-        * alarm after current time.
-        */
-       ret = _isl12057_rtc_read_time(dev, &rtc_tm);
-       if (ret)
-               goto err_unlock;
-
-       alarm_tm->tm_year = rtc_tm.tm_year;
-       alarm_tm->tm_mon = rtc_tm.tm_mon;
-
-       ret = rtc_tm_to_time(&rtc_tm, &rtc_secs);
-       if (ret)
-               goto err_unlock;
-
-       ret = rtc_tm_to_time(alarm_tm, &alarm_secs);
-       if (ret)
-               goto err_unlock;
-
-       if (alarm_secs < rtc_secs) {
-               if (alarm_tm->tm_mon == 11) {
-                       alarm_tm->tm_mon = 0;
-                       alarm_tm->tm_year += 1;
-               } else {
-                       alarm_tm->tm_mon += 1;
-               }
-       }
  
         ret = regmap_read(data->regmap, ISL12057_REG_INT, &ir);
         if (ret) {
diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c

index d1bf93a8720027e529b6e51b8dbab304c4155c58..58698d21c2c3d37878deedf442a34331659a5c7a 100644 (file)
--- a/drivers/rtc/rtc-m41t80.c
+++ b/drivers/rtc/rtc-m41t80.c
@@ -244,7 +244,7 @@ static int m41t80_alarm_irq_enable(struct device *dev, unsigned int enabled)
  
         retval = i2c_smbus_write_byte_data(client, M41T80_REG_ALARM_MON, flags);
         if (retval < 0) {
-               dev_info(dev, "Unable to enable alarm IRQ %d\n", retval);
+               dev_err(dev, "Unable to enable alarm IRQ %d\n", retval);
                 return retval;
         }
         return 0;
@@ -320,10 +320,8 @@ static int m41t80_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
         alrm->time.tm_sec  = bcd2bin(alarmvals[4] & 0x7f);
         alrm->time.tm_min  = bcd2bin(alarmvals[3] & 0x7f);
         alrm->time.tm_hour = bcd2bin(alarmvals[2] & 0x3f);
-       alrm->time.tm_wday = -1;
         alrm->time.tm_mday = bcd2bin(alarmvals[1] & 0x3f);
         alrm->time.tm_mon  = bcd2bin(alarmvals[0] & 0x3f);
-       alrm->time.tm_year = -1;
  
         alrm->enabled = !!(alarmvals[0] & M41T80_ALMON_AFE);
         alrm->pending = (flags & M41T80_FLAGS_AF) && alrm->enabled;
@@ -337,6 +335,30 @@ static struct rtc_class_ops m41t80_rtc_ops = {
         .proc = m41t80_rtc_proc,
  };
  
+#ifdef CONFIG_PM_SLEEP
+/*
+ * System-sleep suspend hook: when the client has an IRQ number (>= 0) and
+ * the device is allowed to wake the system, arm that IRQ as a wakeup source.
+ * Always reports success.
+ */
+static int m41t80_suspend(struct device *dev)
+{
+       struct i2c_client *i2c = to_i2c_client(dev);
+
+       /* Nothing to arm without an IRQ or without wakeup permission */
+       if (i2c->irq < 0 || !device_may_wakeup(dev))
+               return 0;
+
+       enable_irq_wake(i2c->irq);
+       return 0;
+}
+
+/*
+ * System-sleep resume hook: undo the wakeup arming done on suspend, using
+ * the same condition (IRQ number >= 0 and device allowed to wake the
+ * system). Always reports success.
+ */
+static int m41t80_resume(struct device *dev)
+{
+       struct i2c_client *i2c = to_i2c_client(dev);
+
+       /* Mirror of the suspend path: only disarm what would have been armed */
+       if (i2c->irq < 0 || !device_may_wakeup(dev))
+               return 0;
+
+       disable_irq_wake(i2c->irq);
+       return 0;
+}
+#endif
+
+/* PM operations table pairing m41t80_suspend() with m41t80_resume() */
+static SIMPLE_DEV_PM_OPS(m41t80_pm, m41t80_suspend, m41t80_resume);
+
  static ssize_t flags_show(struct device *dev,
                           struct device_attribute *attr, char *buf)
  {
@@ -831,10 +853,9 @@ static int m41t80_probe(struct i2c_client *client,
                 return rc;
         }
  
-       rc = devm_add_action(&client->dev, m41t80_remove_sysfs_group,
-                            &client->dev);
+       rc = devm_add_action_or_reset(&client->dev, m41t80_remove_sysfs_group,
+                                     &client->dev);
         if (rc) {
-               m41t80_remove_sysfs_group(&client->dev);
                 dev_err(&client->dev,
                         "Failed to add sysfs cleanup action: %d\n", rc);
                 return rc;
@@ -873,6 +894,7 @@ static int m41t80_remove(struct i2c_client *client)
  static struct i2c_driver m41t80_driver = {
         .driver = {
                 .name = "rtc-m41t80",
+               .pm = &m41t80_pm,
         },
         .probe = m41t80_probe,
         .remove = m41t80_remove,
diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c

index f72b91f2501f12e7a217c9ecf312204f859591f6..0eeb5714c00fa610f7e438ac084ee16e09a89ae3 100644 (file)
--- a/drivers/rtc/rtc-m48t86.c
+++ b/drivers/rtc/rtc-m48t86.c
@@ -16,7 +16,7 @@
  #include <linux/module.h>
  #include <linux/rtc.h>
  #include <linux/platform_device.h>
-#include <linux/m48t86.h>
+#include <linux/platform_data/rtc-m48t86.h>
  #include <linux/bcd.h>
  
  #define M48T86_REG_SEC         0x00
diff --git a/drivers/rtc/rtc-max6916.c b/drivers/rtc/rtc-max6916.c

new file mode 100644 (file)

index 0000000..623ab27
--- /dev/null
+++ b/drivers/rtc/rtc-max6916.c
@@ -0,0 +1,164 @@
+/* rtc-max6916.c
+ *
+ * Driver for Maxim MAX6916 Low Current, SPI Compatible
+ * Real Time Clock
+ *
+ * Author : Venkat Prashanth B U <venkat.prashanth2498@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+#include <linux/spi/spi.h>
+#include <linux/bcd.h>
+
+/* Registers in max6916 rtc */
+
+#define MAX6916_SECONDS_REG    0x01
+#define MAX6916_MINUTES_REG    0x02
+#define MAX6916_HOURS_REG      0x03
+#define MAX6916_DATE_REG       0x04
+#define MAX6916_MONTH_REG      0x05
+#define MAX6916_DAY_REG        0x06
+#define MAX6916_YEAR_REG       0x07
+#define MAX6916_CONTROL_REG    0x08
+#define MAX6916_STATUS_REG     0x0C
+#define MAX6916_CLOCK_BURST    0x3F
+
+/*
+ * Read one register. The address is sent with bit 7 set and the byte that
+ * comes back is stored in *data. *data doubles as the transmit buffer, so it
+ * holds the command byte (address | 0x80) if the transfer fails.
+ *
+ * Returns the result of spi_write_then_read().
+ */
+static int max6916_read_reg(struct device *dev, unsigned char address,
+                           unsigned char *data)
+{
+       struct spi_device *slave = to_spi_device(dev);
+
+       /* One byte out (the command), one byte back in, same storage */
+       *data = 0x80 | address;
+       return spi_write_then_read(slave, data, 1, data, 1);
+}
+
+/*
+ * Write one register: the address with bit 7 cleared, followed by the value,
+ * sent as a single two-byte transfer with nothing read back.
+ *
+ * Returns the result of spi_write_then_read().
+ */
+static int max6916_write_reg(struct device *dev, unsigned char address,
+                            unsigned char data)
+{
+       unsigned char frame[2] = { address & 0x7F, data };
+
+       return spi_write_then_read(to_spi_device(dev), frame, 2, NULL, 0);
+}
+
+/*
+ * Read the time with a single clock-burst transfer: one command byte
+ * (MAX6916_CLOCK_BURST with bit 7 set) out, eight bytes back. The first
+ * seven bytes are decoded from BCD into @dt; the eighth is not used.
+ *
+ * Returns the SPI error if the transfer fails, otherwise the result of
+ * rtc_valid_tm() on the decoded time.
+ */
+static int max6916_read_time(struct device *dev, struct rtc_time *dt)
+{
+       struct spi_device *spi = to_spi_device(dev);
+       unsigned char cmd = MAX6916_CLOCK_BURST | 0x80;
+       unsigned char regs[8];
+       int err;
+
+       err = spi_write_then_read(spi, &cmd, 1, regs, 8);
+       if (err)
+               return err;
+
+       dt->tm_sec = bcd2bin(regs[0]);
+       dt->tm_min = bcd2bin(regs[1]);
+       dt->tm_hour = bcd2bin(regs[2] & 0x3F);
+       dt->tm_mday = bcd2bin(regs[3]);
+       /* month and weekday are shifted down by one for struct rtc_time */
+       dt->tm_mon = bcd2bin(regs[4]) - 1;
+       dt->tm_wday = bcd2bin(regs[5]) - 1;
+       /* two-digit year plus 100, matching the 100..199 range set_time takes */
+       dt->tm_year = bcd2bin(regs[6]) + 100;
+
+       return rtc_valid_tm(dt);
+}
+
+/*
+ * Set the clock with one burst write: the command byte (MAX6916_CLOCK_BURST
+ * with bit 7 cleared) followed by seconds, minutes, hours, date, month,
+ * weekday and two-digit year in BCD, plus a trailing zero byte.
+ *
+ * Only tm_year 100..199 (2000..2099) is accepted; anything else is logged
+ * and rejected with -EINVAL. Otherwise returns the SPI transfer result.
+ */
+static int max6916_set_time(struct device *dev, struct rtc_time *dt)
+{
+       struct spi_device *spi = to_spi_device(dev);
+       unsigned char frame[9];
+       unsigned char *p = frame;
+
+       if (dt->tm_year < 100 || dt->tm_year > 199) {
+               dev_err(&spi->dev, "Year must be between 2000 and 2099. It's %d.\n",
+                       dt->tm_year + 1900);
+               return -EINVAL;
+       }
+
+       *p++ = MAX6916_CLOCK_BURST & 0x7F;
+       *p++ = bin2bcd(dt->tm_sec);
+       *p++ = bin2bcd(dt->tm_min);
+       *p++ = (bin2bcd(dt->tm_hour) & 0x3F);
+       *p++ = bin2bcd(dt->tm_mday);
+       /* month and weekday are stored one higher than in struct rtc_time */
+       *p++ = bin2bcd(dt->tm_mon + 1);
+       *p++ = bin2bcd(dt->tm_wday + 1);
+       *p++ = bin2bcd(dt->tm_year % 100);
+       /* NOTE(review): presumably the control byte of the burst - confirm */
+       *p++ = bin2bcd(0x00);
+
+       /* write the rtc settings */
+       return spi_write_then_read(spi, frame, sizeof(frame), NULL, 0);
+}
+
+/* RTC class callbacks: only reading and setting the time are provided */
+static const struct rtc_class_ops max6916_rtc_ops = {
+       .read_time = max6916_read_time,
+       .set_time = max6916_set_time,
+};
+
+/*
+ * Probe: configure the SPI link, check that the chip answers, clear bit 7 of
+ * the control register (write protect), mask the status register down to
+ * 0x1B, log both registers and register the RTC device.
+ *
+ * Every step that later operation depends on is checked, so a bus that could
+ * not be set up or a chip that could not be unprotected fails the probe
+ * instead of registering an RTC that cannot work. The two diagnostic reads
+ * at the end are best effort.
+ *
+ * Returns 0 on success or the negative error from the failing step.
+ */
+static int max6916_probe(struct spi_device *spi)
+{
+       struct rtc_device *rtc;
+       unsigned char data;
+       int res;
+
+       /* spi setup with max6916 in mode 3 and bits per word as 8 */
+       spi->mode = SPI_MODE_3;
+       spi->bits_per_word = 8;
+       res = spi_setup(spi);
+       if (res)
+               return res;
+
+       /* RTC Settings: a first read that fails means the chip is unreachable */
+       res = max6916_read_reg(&spi->dev, MAX6916_SECONDS_REG, &data);
+       if (res)
+               return res;
+
+       /* Disable the write protect of rtc */
+       res = max6916_read_reg(&spi->dev, MAX6916_CONTROL_REG, &data);
+       if (res)
+               return res;
+       data = data & ~(1 << 7);
+       res = max6916_write_reg(&spi->dev, MAX6916_CONTROL_REG, data);
+       if (res)
+               return res;
+
+       /* Enable oscillator, disable oscillator stop flag, glitch filter */
+       res = max6916_read_reg(&spi->dev, MAX6916_STATUS_REG, &data);
+       if (res)
+               return res;
+       data = data & 0x1B;
+       res = max6916_write_reg(&spi->dev, MAX6916_STATUS_REG, data);
+       if (res)
+               return res;
+
+       /*
+        * display the settings; on a failed read "data" only holds the
+        * command byte, so print nothing rather than a bogus value
+        */
+       if (!max6916_read_reg(&spi->dev, MAX6916_CONTROL_REG, &data))
+               dev_info(&spi->dev, "MAX6916 RTC CTRL Reg = 0x%02x\n", data);
+
+       if (!max6916_read_reg(&spi->dev, MAX6916_STATUS_REG, &data))
+               dev_info(&spi->dev, "MAX6916 RTC Status Reg = 0x%02x\n", data);
+
+       rtc = devm_rtc_device_register(&spi->dev, "max6916",
+                                      &max6916_rtc_ops, THIS_MODULE);
+       if (IS_ERR(rtc))
+               return PTR_ERR(rtc);
+
+       spi_set_drvdata(spi, rtc);
+
+       return 0;
+}
+
+/* SPI driver description: named "max6916", with only a probe hook set */
+static struct spi_driver max6916_driver = {
+       .driver = {
+               .name = "max6916",
+       },
+       .probe = max6916_probe,
+};
+module_spi_driver(max6916_driver);
+
+MODULE_DESCRIPTION("MAX6916 SPI RTC DRIVER");
+MODULE_AUTHOR("Venkat Prashanth B U <venkat.prashanth2498@gmail.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c

new file mode 100644 (file)

index 0000000..2f1772a
--- /dev/null
+++ b/drivers/rtc/rtc-mc146818-lib.c
@@ -0,0 +1,198 @@
+#include <linux/bcd.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/mc146818rtc.h>
+
+#ifdef CONFIG_ACPI
+#include <linux/acpi.h>
+#endif
+
+/*
+ * Returns non-zero (the RTC_UIP bit of RTC_FREQ_SELECT) while a clock
+ * update is in progress, zero otherwise. The register is sampled under
+ * rtc_lock.
+ */
+static inline unsigned char mc146818_is_updating(void)
+{
+       unsigned long irqflags;
+       unsigned char in_progress;
+
+       spin_lock_irqsave(&rtc_lock, irqflags);
+       in_progress = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP;
+       spin_unlock_irqrestore(&rtc_lock, irqflags);
+
+       return in_progress;
+}
+
+/*
+ * Read the current time from the RTC registers into @time.
+ *
+ * The raw register values are read under rtc_lock, converted from BCD when
+ * the control register does not select binary mode (or RTC_ALWAYS_BCD is
+ * set), and then adjusted: the DECstation year offset and the ACPI century
+ * register are applied where configured, years <= 69 are moved up by 100,
+ * and the month is made zero-based.
+ *
+ * Always returns RTC_24H.
+ */
+unsigned int mc146818_get_time(struct rtc_time *time)
+{
+       unsigned char ctrl;
+       unsigned long flags;
+       unsigned char century = 0;
+
+#ifdef CONFIG_MACH_DECSTATION
+       unsigned int real_year;
+#endif
+
+       /*
+        * read RTC once any update in progress is done. The update
+        * can take just over 2ms. We wait 20ms. There is no need to
+        * poll-wait (up to 1s - eeccch) for the falling edge of RTC_UIP.
+        * If you need to know *exactly* when a second has started, enable
+        * periodic update complete interrupts, (via ioctl) and then
+        * immediately read /dev/rtc which will block until you get the IRQ.
+        * Once the read clears, read the RTC time (again via ioctl). Easy.
+        */
+       if (mc146818_is_updating())
+               mdelay(20);
+
+       /*
+        * Only the values that we read from the RTC are set. We leave
+        * tm_wday, tm_yday and tm_isdst untouched. Even though the
+        * RTC has RTC_DAY_OF_WEEK, we ignore it, as it is only updated
+        * by the RTC when initially set to a non-zero value.
+        */
+       spin_lock_irqsave(&rtc_lock, flags);
+       time->tm_sec = CMOS_READ(RTC_SECONDS);
+       time->tm_min = CMOS_READ(RTC_MINUTES);
+       time->tm_hour = CMOS_READ(RTC_HOURS);
+       time->tm_mday = CMOS_READ(RTC_DAY_OF_MONTH);
+       time->tm_mon = CMOS_READ(RTC_MONTH);
+       time->tm_year = CMOS_READ(RTC_YEAR);
+#ifdef CONFIG_MACH_DECSTATION
+       real_year = CMOS_READ(RTC_DEC_YEAR);
+#endif
+#ifdef CONFIG_ACPI
+       /* The century register index comes from the FADT; 0 means none */
+       if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+           acpi_gbl_FADT.century)
+               century = CMOS_READ(acpi_gbl_FADT.century);
+#endif
+       ctrl = CMOS_READ(RTC_CONTROL);
+       spin_unlock_irqrestore(&rtc_lock, flags);
+
+       /* Values are still raw here; decode them if the chip stores BCD */
+       if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
+       {
+               time->tm_sec = bcd2bin(time->tm_sec);
+               time->tm_min = bcd2bin(time->tm_min);
+               time->tm_hour = bcd2bin(time->tm_hour);
+               time->tm_mday = bcd2bin(time->tm_mday);
+               time->tm_mon = bcd2bin(time->tm_mon);
+               time->tm_year = bcd2bin(time->tm_year);
+               century = bcd2bin(century);
+       }
+
+#ifdef CONFIG_MACH_DECSTATION
+       time->tm_year += real_year - 72;
+#endif
+
+       /* A century of 19 adds nothing, 20 adds 100 years, and so on */
+       if (century)
+               time->tm_year += (century - 19) * 100;
+
+       /*
+        * Account for differences between how the RTC uses the values
+        * and how they are defined in a struct rtc_time;
+        */
+       if (time->tm_year <= 69)
+               time->tm_year += 100;
+
+       time->tm_mon--;
+
+       return RTC_24H;
+}
+EXPORT_SYMBOL_GPL(mc146818_get_time);
+
+/*
+ * Set the current date and time in the real time clock.
+ *
+ * The fields of @time are converted to the register format in use (BCD
+ * unless the control register selects binary mode and RTC_ALWAYS_BCD is
+ * clear) and written under rtc_lock. tm_wday, tm_yday and tm_isdst are not
+ * used.
+ *
+ * Returns 0 on success, or -EINVAL if the year cannot be represented
+ * (tm_year > 255, or a year value > 169 after the DECstation/ACPI
+ * adjustments below).
+ */
+int mc146818_set_time(struct rtc_time *time)
+{
+       unsigned long flags;
+       unsigned char mon, day, hrs, min, sec;
+       unsigned char save_control, save_freq_select;
+       unsigned int yrs;
+#ifdef CONFIG_MACH_DECSTATION
+       unsigned int real_yrs, leap_yr;
+#endif
+       unsigned char century = 0;
+
+       yrs = time->tm_year;
+       mon = time->tm_mon + 1;   /* tm_mon starts at zero */
+       day = time->tm_mday;
+       hrs = time->tm_hour;
+       min = time->tm_min;
+       sec = time->tm_sec;
+
+       if (yrs > 255)  /* They are unsigned */
+               return -EINVAL;
+
+       spin_lock_irqsave(&rtc_lock, flags);
+#ifdef CONFIG_MACH_DECSTATION
+       /* The real year goes to RTC_DEC_YEAR; RTC_YEAR only holds 72 or 73 */
+       real_yrs = yrs;
+       leap_yr = ((!((yrs + 1900) % 4) && ((yrs + 1900) % 100)) ||
+                       !((yrs + 1900) % 400));
+       yrs = 72;
+
+       /*
+        * We want to keep the year set to 73 until March
+        * for non-leap years, so that Feb, 29th is handled
+        * correctly.
+        */
+       if (!leap_yr && mon < 3) {
+               real_yrs--;
+               yrs = 73;
+       }
+#endif
+
+#ifdef CONFIG_ACPI
+       /* With a FADT century register, split into century + two-digit year */
+       if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+           acpi_gbl_FADT.century) {
+               century = (yrs + 1900) / 100;
+               yrs %= 100;
+       }
+#endif
+
+       /* These limits and adjustments are independent of
+        * whether the chip is in binary mode or not.
+        */
+       if (yrs > 169) {
+               spin_unlock_irqrestore(&rtc_lock, flags);
+               return -EINVAL;
+       }
+
+       if (yrs >= 100)
+               yrs -= 100;
+
+       if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY)
+           || RTC_ALWAYS_BCD) {
+               sec = bin2bcd(sec);
+               min = bin2bcd(min);
+               hrs = bin2bcd(hrs);
+               day = bin2bcd(day);
+               mon = bin2bcd(mon);
+               yrs = bin2bcd(yrs);
+               century = bin2bcd(century);
+       }
+
+       /*
+        * Set RTC_SET and RTC_DIV_RESET2 around the register writes and
+        * restore both registers afterwards.
+        * NOTE(review): presumably this holds off update cycles while the
+        * time registers are rewritten - confirm against the datasheet.
+        */
+       save_control = CMOS_READ(RTC_CONTROL);
+       CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
+       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+       CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
+
+#ifdef CONFIG_MACH_DECSTATION
+       CMOS_WRITE(real_yrs, RTC_DEC_YEAR);
+#endif
+       CMOS_WRITE(yrs, RTC_YEAR);
+       CMOS_WRITE(mon, RTC_MONTH);
+       CMOS_WRITE(day, RTC_DAY_OF_MONTH);
+       CMOS_WRITE(hrs, RTC_HOURS);
+       CMOS_WRITE(min, RTC_MINUTES);
+       CMOS_WRITE(sec, RTC_SECONDS);
+#ifdef CONFIG_ACPI
+       if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+           acpi_gbl_FADT.century)
+               CMOS_WRITE(century, acpi_gbl_FADT.century);
+#endif
+
+       CMOS_WRITE(save_control, RTC_CONTROL);
+       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+
+       spin_unlock_irqrestore(&rtc_lock, flags);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mc146818_set_time);
diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c

index 0094d9bdd1e6a7fba10530f57582ecd6cd026352..7334c44fa7c3553d61db84570e2a78733d2e24e6 100644 (file)
--- a/drivers/rtc/rtc-mrst.c
+++ b/drivers/rtc/rtc-mrst.c
@@ -32,11 +32,11 @@
  #include <linux/interrupt.h>
  #include <linux/spinlock.h>
  #include <linux/kernel.h>
+#include <linux/mc146818rtc.h>
  #include <linux/module.h>
  #include <linux/init.h>
  #include <linux/sfi.h>
  
-#include <asm-generic/rtc.h>
  #include <asm/intel_scu_ipc.h>
  #include <asm/intel-mid.h>
  #include <asm/intel_mid_vrtc.h>
@@ -149,14 +149,6 @@ static int mrst_read_alarm(struct device *dev, struct rtc_wkalrm *t)
         if (mrst->irq <= 0)
                 return -EIO;
  
-       /* Basic alarms only support hour, minute, and seconds fields.
-        * Some also support day and month, for alarms up to a year in
-        * the future.
-        */
-       t->time.tm_mday = -1;
-       t->time.tm_mon = -1;
-       t->time.tm_year = -1;
-
         /* vRTC only supports binary mode */
         spin_lock_irq(&rtc_lock);
         t->time.tm_sec = vrtc_cmos_read(RTC_SECONDS_ALARM);
diff --git a/drivers/rtc/rtc-pcf2123.c b/drivers/rtc/rtc-pcf2123.c

index f22e060709e547528b29b747f796b45c445d23fa..b4478cc92b55dbd76731ea8ec6a4228677adcebb 100644 (file)
--- a/drivers/rtc/rtc-pcf2123.c
+++ b/drivers/rtc/rtc-pcf2123.c
@@ -96,7 +96,7 @@
  #define CD_TMR_TE              BIT(3)  /* Countdown timer enable */
  
  /* PCF2123_REG_OFFSET BITS */
-#define OFFSET_SIGN_BIT                BIT(6)  /* 2's complement sign bit */
+#define OFFSET_SIGN_BIT                6       /* 2's complement sign bit */
  #define OFFSET_COARSE          BIT(7)  /* Coarse mode offset */
  #define OFFSET_STEP            (2170)  /* Offset step in parts per billion */
  
@@ -217,7 +217,7 @@ static int pcf2123_read_offset(struct device *dev, long *offset)
         if (reg & OFFSET_COARSE)
                 reg <<= 1; /* multiply by 2 and sign extend */
         else
-               reg |= (reg & OFFSET_SIGN_BIT) << 1; /* sign extend only */
+               reg = sign_extend32(reg, OFFSET_SIGN_BIT);
  
         *offset = ((long)reg) * OFFSET_STEP;
  
diff --git a/drivers/rtc/rtc-pcf85063.c b/drivers/rtc/rtc-pcf85063.c

index e8ddbb359d11951f0d9592895a23699941280412..efb0a08ac1175182a07ef9d63a49184a20698bd4 100644 (file)
--- a/drivers/rtc/rtc-pcf85063.c
+++ b/drivers/rtc/rtc-pcf85063.c
@@ -16,6 +16,16 @@
  #include <linux/rtc.h>
  #include <linux/module.h>
  
+/*
+ * Information for this driver was pulled from the following datasheets.
+ *
+ *  http://www.nxp.com/documents/data_sheet/PCF85063A.pdf
+ *  http://www.nxp.com/documents/data_sheet/PCF85063TP.pdf
+ *
+ *  PCF85063A -- Rev. 6 — 18 November 2015
+ *  PCF85063TP -- Rev. 4 — 6 May 2015
+ */
+
  #define PCF85063_REG_CTRL1             0x00 /* status */
  #define PCF85063_REG_CTRL1_STOP                BIT(5)
  #define PCF85063_REG_CTRL2             0x01
@@ -55,10 +65,22 @@ static int pcf85063_stop_clock(struct i2c_client *client, u8 *ctrl1)
         return 0;
  }
  
-/*
- * In the routines that deal directly with the pcf85063 hardware, we use
- * rtc_time -- month 0-11, hour 0-23, yr = calendar year-epoch.
- */
+/*
+ * Restart the clock: clear the STOP bit in the CTRL1 value supplied by the
+ * caller and write the result to the CTRL1 register, leaving every other
+ * bit of that value as it was.
+ *
+ * Returns 0 on success, -EIO if the register write fails.
+ */
+static int pcf85063_start_clock(struct i2c_client *client, u8 ctrl1)
+{
+       s32 ret;
+
+       /*
+        * start the clock: the mask must be inverted, otherwise only the
+        * STOP bit survives and the clock is left stopped
+        */
+       ctrl1 &= ~PCF85063_REG_CTRL1_STOP;
+
+       ret = i2c_smbus_write_byte_data(client, PCF85063_REG_CTRL1, ctrl1);
+       if (ret < 0) {
+               dev_err(&client->dev, "Failing to start the clock\n");
+               return -EIO;
+       }
+
+       return 0;
+}
+
  static int pcf85063_get_datetime(struct i2c_client *client, struct rtc_time *tm)
  {
         int rc;
@@ -90,8 +112,7 @@ static int pcf85063_get_datetime(struct i2c_client *client, struct rtc_time *tm)
         tm->tm_wday = regs[4] & 0x07;
         tm->tm_mon = bcd2bin(regs[5] & 0x1F) - 1; /* rtc mn 1-12 */
         tm->tm_year = bcd2bin(regs[6]);
-       if (tm->tm_year < 70)
-               tm->tm_year += 100;     /* assume we are in 1970...2069 */
+       tm->tm_year += 100;
  
         return rtc_valid_tm(tm);
  }
@@ -99,13 +120,17 @@ static int pcf85063_get_datetime(struct i2c_client *client, struct rtc_time *tm)
  static int pcf85063_set_datetime(struct i2c_client *client, struct rtc_time *tm)
  {
         int rc;
-       u8 regs[8];
+       u8 regs[7];
+       u8 ctrl1;
+
+       if ((tm->tm_year < 100) || (tm->tm_year > 199))
+               return -EINVAL;
  
         /*
          * to accurately set the time, reset the divider chain and keep it in
          * reset state until all time/date registers are written
          */
-       rc = pcf85063_stop_clock(client, &regs[7]);
+       rc = pcf85063_stop_clock(client, &ctrl1);
         if (rc != 0)
                 return rc;
  
@@ -125,14 +150,7 @@ static int pcf85063_set_datetime(struct i2c_client *client, struct rtc_time *tm)
         regs[5] = bin2bcd(tm->tm_mon + 1);
  
         /* year and century */
-       regs[6] = bin2bcd(tm->tm_year % 100);
-
-       /*
-        * after all time/date registers are written, let the 'address auto
-        * increment' feature wrap around and write register CTRL1 to re-enable
-        * the clock divider chain again
-        */
-       regs[7] &= ~PCF85063_REG_CTRL1_STOP;
+       regs[6] = bin2bcd(tm->tm_year - 100);
  
         /* write all registers at once */
         rc = i2c_smbus_write_i2c_block_data(client, PCF85063_REG_SC,
@@ -142,6 +160,15 @@ static int pcf85063_set_datetime(struct i2c_client *client, struct rtc_time *tm)
                 return rc;
         }
  
+       /*
+        * Write the control register as a separate action since the size of
+        * the register space is different between the PCF85063TP and
+        * PCF85063A devices.  The rollover point can not be used.
+        */
+       rc = pcf85063_start_clock(client, ctrl1);
+       if (rc != 0)
+               return rc;
+
         return 0;
  }
  
diff --git a/drivers/rtc/rtc-pcf8563.c b/drivers/rtc/rtc-pcf8563.c

index b9ddbb001283e65335026932fdd03af3e55f81b2..1227ceab61eecc408526c6eb542477058a46417a 100644 (file)
--- a/drivers/rtc/rtc-pcf8563.c
+++ b/drivers/rtc/rtc-pcf8563.c
@@ -341,14 +341,11 @@ static int pcf8563_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *tm)
                 "%s: raw data is min=%02x, hr=%02x, mday=%02x, wday=%02x\n",
                 __func__, buf[0], buf[1], buf[2], buf[3]);
  
+       tm->time.tm_sec = 0;
         tm->time.tm_min = bcd2bin(buf[0] & 0x7F);
         tm->time.tm_hour = bcd2bin(buf[1] & 0x3F);
         tm->time.tm_mday = bcd2bin(buf[2] & 0x3F);
         tm->time.tm_wday = bcd2bin(buf[3] & 0x7);
-       tm->time.tm_mon = -1;
-       tm->time.tm_year = -1;
-       tm->time.tm_yday = -1;
-       tm->time.tm_isdst = -1;
  
         err = pcf8563_get_alarm_mode(client, &tm->enabled, &tm->pending);
         if (err < 0)
diff --git a/drivers/rtc/rtc-rc5t583.c b/drivers/rtc/rtc-rc5t583.c

index f28d577889511550b641719d8a8f8dd2edc75d1e..68ce77414bdc6c671a9e1e631d7935b017c9cf39 100644 (file)
--- a/drivers/rtc/rtc-rc5t583.c
+++ b/drivers/rtc/rtc-rc5t583.c
@@ -128,6 +128,7 @@ static int rc5t583_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
                 return ret;
         }
  
+       alm->time.tm_sec = 0;
         alm->time.tm_min = bcd2bin(alarm_data[0]);
         alm->time.tm_hour = bcd2bin(alarm_data[1]);
         alm->time.tm_mday = bcd2bin(alarm_data[2]);
diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c

index ef86229428fc749b6a84a8993ff6fe50a2a92e3f..c8c7574667837522ae4863f743ba978229116fc5 100644 (file)
--- a/drivers/rtc/rtc-rs5c372.c
+++ b/drivers/rtc/rtc-rs5c372.c
@@ -341,12 +341,6 @@ static int rs5c_read_alarm(struct device *dev, struct rtc_wkalrm *t)
         t->time.tm_sec = 0;
         t->time.tm_min = bcd2bin(rs5c->regs[RS5C_REG_ALARM_A_MIN] & 0x7f);
         t->time.tm_hour = rs5c_reg2hr(rs5c, rs5c->regs[RS5C_REG_ALARM_A_HOURS]);
-       t->time.tm_mday = -1;
-       t->time.tm_mon = -1;
-       t->time.tm_year = -1;
-       t->time.tm_wday = -1;
-       t->time.tm_yday = -1;
-       t->time.tm_isdst = -1;
  
         /* ... and status */
         t->enabled = !!(rs5c->regs[RS5C_REG_CTRL1] & RS5C_CTRL1_AALE);
diff --git a/drivers/rtc/rtc-rv8803.c b/drivers/rtc/rtc-rv8803.c

index f623038e586ecf270989a8c0bb0246d2ff279925..9a2f6a95d5a7cf5757308c3ce6668e2f78d20c35 100644 (file)
--- a/drivers/rtc/rtc-rv8803.c
+++ b/drivers/rtc/rtc-rv8803.c
@@ -13,12 +13,15 @@
  
  #include <linux/bcd.h>
  #include <linux/bitops.h>
+#include <linux/log2.h>
  #include <linux/i2c.h>
  #include <linux/interrupt.h>
  #include <linux/kernel.h>
  #include <linux/module.h>
  #include <linux/rtc.h>
  
+#define RV8803_I2C_TRY_COUNT           4
+
  #define RV8803_SEC                     0x00
  #define RV8803_MIN                     0x01
  #define RV8803_HOUR                    0x02
@@ -56,19 +59,85 @@ struct rv8803_data {
         u8 ctrl;
  };
  
+/*
+ * Read a single register, retrying on -ENXIO/-EIO.
+ *
+ * There is a 61µs window during which the RTC does not acknowledge I2C
+ * transfers, so up to RV8803_I2C_TRY_COUNT attempts are made before giving
+ * up. Returns the register value, or a negative errno (which is also
+ * logged).
+ */
+static int rv8803_read_reg(const struct i2c_client *client, u8 reg)
+{
+       int attempts = RV8803_I2C_TRY_COUNT;
+       s32 val;
+
+       for (;;) {
+               val = i2c_smbus_read_byte_data(client, reg);
+               if (val != -ENXIO && val != -EIO)
+                       break;
+               if (--attempts == 0)
+                       break;
+       }
+
+       if (val < 0)
+               dev_err(&client->dev, "Unable to read register 0x%02x\n", reg);
+
+       return val;
+}
+
+/*
+ * Read @count consecutive registers starting at @reg into @values, making
+ * up to RV8803_I2C_TRY_COUNT attempts while the transfer fails with -ENXIO
+ * or -EIO.
+ *
+ * Returns 0 when exactly @count bytes were read. Otherwise logs the
+ * register range and returns the negative errno, or -EIO for a short read.
+ */
+static int rv8803_read_regs(const struct i2c_client *client,
+                           u8 reg, u8 count, u8 *values)
+{
+       int attempts = RV8803_I2C_TRY_COUNT;
+       s32 nread;
+
+       for (;;) {
+               nread = i2c_smbus_read_i2c_block_data(client, reg, count,
+                                                     values);
+               if (nread != -ENXIO && nread != -EIO)
+                       break;
+               if (--attempts == 0)
+                       break;
+       }
+
+       if (nread == count)
+               return 0;
+
+       dev_err(&client->dev,
+               "Unable to read registers 0x%02x..0x%02x\n",
+               reg, reg + count - 1);
+
+       if (nread < 0)
+               return nread;
+       return -EIO;
+}
+
+/*
+ * Write a single register, making up to RV8803_I2C_TRY_COUNT attempts while
+ * the transfer fails with -ENXIO or -EIO.
+ *
+ * Returns the result of the last attempt; a non-zero result is logged.
+ */
+static int rv8803_write_reg(const struct i2c_client *client, u8 reg, u8 value)
+{
+       int attempts = RV8803_I2C_TRY_COUNT;
+       s32 status;
+
+       for (;;) {
+               status = i2c_smbus_write_byte_data(client, reg, value);
+               if (status != -ENXIO && status != -EIO)
+                       break;
+               if (--attempts == 0)
+                       break;
+       }
+
+       if (status)
+               dev_err(&client->dev, "Unable to write register 0x%02x\n", reg);
+
+       return status;
+}
+
+/*
+ * Write @count consecutive registers starting at @reg from @values, making
+ * up to RV8803_I2C_TRY_COUNT attempts while the transfer fails with -ENXIO
+ * or -EIO.
+ *
+ * Returns the result of the last attempt; a non-zero result is logged
+ * together with the register range.
+ */
+static int rv8803_write_regs(const struct i2c_client *client,
+                            u8 reg, u8 count, const u8 *values)
+{
+       int attempts = RV8803_I2C_TRY_COUNT;
+       s32 status;
+
+       for (;;) {
+               status = i2c_smbus_write_i2c_block_data(client, reg, count,
+                                                       values);
+               if (status != -ENXIO && status != -EIO)
+                       break;
+               if (--attempts == 0)
+                       break;
+       }
+
+       if (status)
+               dev_err(&client->dev,
+                       "Unable to write registers 0x%02x..0x%02x\n",
+                       reg, reg + count - 1);
+
+       return status;
+}
+
  static irqreturn_t rv8803_handle_irq(int irq, void *dev_id)
  {
         struct i2c_client *client = dev_id;
         struct rv8803_data *rv8803 = i2c_get_clientdata(client);
         unsigned long events = 0;
-       int flags, try = 0;
+       int flags;
  
         mutex_lock(&rv8803->flags_lock);
  
-       do {
-               flags = i2c_smbus_read_byte_data(client, RV8803_FLAG);
-               try++;
-       } while ((flags == -ENXIO) && (try < 3));
+       flags = rv8803_read_reg(client, RV8803_FLAG);
         if (flags <= 0) {
                 mutex_unlock(&rv8803->flags_lock);
                 return IRQ_NONE;
@@ -100,9 +169,8 @@ static irqreturn_t rv8803_handle_irq(int irq, void *dev_id)
  
         if (events) {
                 rtc_update_irq(rv8803->rtc, 1, events);
-               i2c_smbus_write_byte_data(client, RV8803_FLAG, flags);
-               i2c_smbus_write_byte_data(rv8803->client, RV8803_CTRL,
-                                         rv8803->ctrl);
+               rv8803_write_reg(client, RV8803_FLAG, flags);
+               rv8803_write_reg(rv8803->client, RV8803_CTRL, rv8803->ctrl);
         }
  
         mutex_unlock(&rv8803->flags_lock);
@@ -118,7 +186,7 @@ static int rv8803_get_time(struct device *dev, struct rtc_time *tm)
         u8 *date = date1;
         int ret, flags;
  
-       flags = i2c_smbus_read_byte_data(rv8803->client, RV8803_FLAG);
+       flags = rv8803_read_reg(rv8803->client, RV8803_FLAG);
         if (flags < 0)
                 return flags;
  
@@ -127,16 +195,14 @@ static int rv8803_get_time(struct device *dev, struct rtc_time *tm)
                 return -EINVAL;
         }
  
-       ret = i2c_smbus_read_i2c_block_data(rv8803->client, RV8803_SEC,
-                                           7, date);
-       if (ret != 7)
-               return ret < 0 ? ret : -EIO;
+       ret = rv8803_read_regs(rv8803->client, RV8803_SEC, 7, date);
+       if (ret)
+               return ret;
  
         if ((date1[RV8803_SEC] & 0x7f) == bin2bcd(59)) {
-               ret = i2c_smbus_read_i2c_block_data(rv8803->client, RV8803_SEC,
-                                                   7, date2);
-               if (ret != 7)
-                       return ret < 0 ? ret : -EIO;
+               ret = rv8803_read_regs(rv8803->client, RV8803_SEC, 7, date2);
+               if (ret)
+                       return ret;
  
                 if ((date2[RV8803_SEC] & 0x7f) != bin2bcd(59))
                         date = date2;
@@ -145,23 +211,33 @@ static int rv8803_get_time(struct device *dev, struct rtc_time *tm)
         tm->tm_sec  = bcd2bin(date[RV8803_SEC] & 0x7f);
         tm->tm_min  = bcd2bin(date[RV8803_MIN] & 0x7f);
         tm->tm_hour = bcd2bin(date[RV8803_HOUR] & 0x3f);
-       tm->tm_wday = ffs(date[RV8803_WEEK] & 0x7f);
+       tm->tm_wday = ilog2(date[RV8803_WEEK] & 0x7f);
         tm->tm_mday = bcd2bin(date[RV8803_DAY] & 0x3f);
         tm->tm_mon  = bcd2bin(date[RV8803_MONTH] & 0x1f) - 1;
         tm->tm_year = bcd2bin(date[RV8803_YEAR]) + 100;
  
-       return rtc_valid_tm(tm);
+       return 0;
  }
  
  static int rv8803_set_time(struct device *dev, struct rtc_time *tm)
  {
         struct rv8803_data *rv8803 = dev_get_drvdata(dev);
         u8 date[7];
-       int flags, ret;
+       int ctrl, flags, ret;
  
         if ((tm->tm_year < 100) || (tm->tm_year > 199))
                 return -EINVAL;
  
+       ctrl = rv8803_read_reg(rv8803->client, RV8803_CTRL);
+       if (ctrl < 0)
+               return ctrl;
+
+       /* Stop the clock */
+       ret = rv8803_write_reg(rv8803->client, RV8803_CTRL,
+                              ctrl | RV8803_CTRL_RESET);
+       if (ret)
+               return ret;
+
         date[RV8803_SEC]   = bin2bcd(tm->tm_sec);
         date[RV8803_MIN]   = bin2bcd(tm->tm_min);
         date[RV8803_HOUR]  = bin2bcd(tm->tm_hour);
@@ -170,21 +246,26 @@ static int rv8803_set_time(struct device *dev, struct rtc_time *tm)
         date[RV8803_MONTH] = bin2bcd(tm->tm_mon + 1);
         date[RV8803_YEAR]  = bin2bcd(tm->tm_year - 100);
  
-       ret = i2c_smbus_write_i2c_block_data(rv8803->client, RV8803_SEC,
-                                            7, date);
-       if (ret < 0)
+       ret = rv8803_write_regs(rv8803->client, RV8803_SEC, 7, date);
+       if (ret)
+               return ret;
+
+       /* Restart the clock */
+       ret = rv8803_write_reg(rv8803->client, RV8803_CTRL,
+                              ctrl & ~RV8803_CTRL_RESET);
+       if (ret)
                 return ret;
  
         mutex_lock(&rv8803->flags_lock);
  
-       flags = i2c_smbus_read_byte_data(rv8803->client, RV8803_FLAG);
+       flags = rv8803_read_reg(rv8803->client, RV8803_FLAG);
         if (flags < 0) {
                 mutex_unlock(&rv8803->flags_lock);
                 return flags;
         }
  
-       ret = i2c_smbus_write_byte_data(rv8803->client, RV8803_FLAG,
-                                       flags & ~RV8803_FLAG_V2F);
+       ret = rv8803_write_reg(rv8803->client, RV8803_FLAG,
+                              flags & ~(RV8803_FLAG_V1F | RV8803_FLAG_V2F));
  
         mutex_unlock(&rv8803->flags_lock);
  
@@ -198,22 +279,18 @@ static int rv8803_get_alarm(struct device *dev, struct rtc_wkalrm *alrm)
         u8 alarmvals[3];
         int flags, ret;
  
-       ret = i2c_smbus_read_i2c_block_data(client, RV8803_ALARM_MIN,
-                                           3, alarmvals);
-       if (ret != 3)
-               return ret < 0 ? ret : -EIO;
+       ret = rv8803_read_regs(client, RV8803_ALARM_MIN, 3, alarmvals);
+       if (ret)
+               return ret;
  
-       flags = i2c_smbus_read_byte_data(client, RV8803_FLAG);
+       flags = rv8803_read_reg(client, RV8803_FLAG);
         if (flags < 0)
                 return flags;
  
         alrm->time.tm_sec  = 0;
         alrm->time.tm_min  = bcd2bin(alarmvals[0] & 0x7f);
         alrm->time.tm_hour = bcd2bin(alarmvals[1] & 0x3f);
-       alrm->time.tm_wday = -1;
         alrm->time.tm_mday = bcd2bin(alarmvals[2] & 0x3f);
-       alrm->time.tm_mon  = -1;
-       alrm->time.tm_year = -1;
  
         alrm->enabled = !!(rv8803->ctrl & RV8803_CTRL_AIE);
         alrm->pending = (flags & RV8803_FLAG_AF) && alrm->enabled;
@@ -239,10 +316,10 @@ static int rv8803_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
  
         mutex_lock(&rv8803->flags_lock);
  
-       ret = i2c_smbus_read_i2c_block_data(client, RV8803_FLAG, 2, ctrl);
-       if (ret != 2) {
+       ret = rv8803_read_regs(client, RV8803_FLAG, 2, ctrl);
+       if (ret) {
                 mutex_unlock(&rv8803->flags_lock);
-               return ret < 0 ? ret : -EIO;
+               return ret;
         }
  
         alarmvals[0] = bin2bcd(alrm->time.tm_min);
@@ -251,8 +328,8 @@ static int rv8803_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
  
         if (rv8803->ctrl & (RV8803_CTRL_AIE | RV8803_CTRL_UIE)) {
                 rv8803->ctrl &= ~(RV8803_CTRL_AIE | RV8803_CTRL_UIE);
-               err = i2c_smbus_write_byte_data(rv8803->client, RV8803_CTRL,
-                                               rv8803->ctrl);
+               err = rv8803_write_reg(rv8803->client, RV8803_CTRL,
+                                      rv8803->ctrl);
                 if (err) {
                         mutex_unlock(&rv8803->flags_lock);
                         return err;
@@ -260,13 +337,12 @@ static int rv8803_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
         }
  
         ctrl[1] &= ~RV8803_FLAG_AF;
-       err = i2c_smbus_write_byte_data(rv8803->client, RV8803_FLAG, ctrl[1]);
+       err = rv8803_write_reg(rv8803->client, RV8803_FLAG, ctrl[1]);
         mutex_unlock(&rv8803->flags_lock);
         if (err)
                 return err;
  
-       err = i2c_smbus_write_i2c_block_data(rv8803->client, RV8803_ALARM_MIN,
-                                            3, alarmvals);
+       err = rv8803_write_regs(rv8803->client, RV8803_ALARM_MIN, 3, alarmvals);
         if (err)
                 return err;
  
@@ -276,8 +352,8 @@ static int rv8803_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
                 if (rv8803->rtc->aie_timer.enabled)
                         rv8803->ctrl |= RV8803_CTRL_AIE;
  
-               err = i2c_smbus_write_byte_data(rv8803->client, RV8803_CTRL,
-                                               rv8803->ctrl);
+               err = rv8803_write_reg(rv8803->client, RV8803_CTRL,
+                                      rv8803->ctrl);
                 if (err)
                         return err;
         }
@@ -306,21 +382,20 @@ static int rv8803_alarm_irq_enable(struct device *dev, unsigned int enabled)
         }
  
         mutex_lock(&rv8803->flags_lock);
-       flags = i2c_smbus_read_byte_data(client, RV8803_FLAG);
+       flags = rv8803_read_reg(client, RV8803_FLAG);
         if (flags < 0) {
                 mutex_unlock(&rv8803->flags_lock);
                 return flags;
         }
         flags &= ~(RV8803_FLAG_AF | RV8803_FLAG_UF);
-       err = i2c_smbus_write_byte_data(client, RV8803_FLAG, flags);
+       err = rv8803_write_reg(client, RV8803_FLAG, flags);
         mutex_unlock(&rv8803->flags_lock);
         if (err)
                 return err;
  
         if (ctrl != rv8803->ctrl) {
                 rv8803->ctrl = ctrl;
-               err = i2c_smbus_write_byte_data(client, RV8803_CTRL,
-                                               rv8803->ctrl);
+               err = rv8803_write_reg(client, RV8803_CTRL, rv8803->ctrl);
                 if (err)
                         return err;
         }
@@ -336,7 +411,7 @@ static int rv8803_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
  
         switch (cmd) {
         case RTC_VL_READ:
-               flags = i2c_smbus_read_byte_data(client, RV8803_FLAG);
+               flags = rv8803_read_reg(client, RV8803_FLAG);
                 if (flags < 0)
                         return flags;
  
@@ -355,16 +430,16 @@ static int rv8803_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
  
         case RTC_VL_CLR:
                 mutex_lock(&rv8803->flags_lock);
-               flags = i2c_smbus_read_byte_data(client, RV8803_FLAG);
+               flags = rv8803_read_reg(client, RV8803_FLAG);
                 if (flags < 0) {
                         mutex_unlock(&rv8803->flags_lock);
                         return flags;
                 }
  
                 flags &= ~(RV8803_FLAG_V1F | RV8803_FLAG_V2F);
-               ret = i2c_smbus_write_byte_data(client, RV8803_FLAG, flags);
+               ret = rv8803_write_reg(client, RV8803_FLAG, flags);
                 mutex_unlock(&rv8803->flags_lock);
-               if (ret < 0)
+               if (ret)
                         return ret;
  
                 return 0;
@@ -382,8 +457,8 @@ static ssize_t rv8803_nvram_write(struct file *filp, struct kobject *kobj,
         struct i2c_client *client = to_i2c_client(dev);
         int ret;
  
-       ret = i2c_smbus_write_byte_data(client, RV8803_RAM, buf[0]);
-       if (ret < 0)
+       ret = rv8803_write_reg(client, RV8803_RAM, buf[0]);
+       if (ret)
                 return ret;
  
         return 1;
@@ -397,7 +472,7 @@ static ssize_t rv8803_nvram_read(struct file *filp, struct kobject *kobj,
         struct i2c_client *client = to_i2c_client(dev);
         int ret;
  
-       ret = i2c_smbus_read_byte_data(client, RV8803_RAM);
+       ret = rv8803_read_reg(client, RV8803_RAM);
         if (ret < 0)
                 return ret;
  
@@ -427,7 +502,7 @@ static int rv8803_probe(struct i2c_client *client,
  {
         struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent);
         struct rv8803_data *rv8803;
-       int err, flags, try = 0;
+       int err, flags;
  
         if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA |
                                      I2C_FUNC_SMBUS_I2C_BLOCK)) {
@@ -444,16 +519,7 @@ static int rv8803_probe(struct i2c_client *client,
         rv8803->client = client;
         i2c_set_clientdata(client, rv8803);
  
-       /*
-        * There is a 60µs window where the RTC may not reply on the i2c bus in
-        * that case, the transfer is not ACKed. In that case, ensure there are
-        * multiple attempts.
-        */
-       do {
-               flags = i2c_smbus_read_byte_data(client, RV8803_FLAG);
-               try++;
-       } while ((flags == -ENXIO) && (try < 3));
-
+       flags = rv8803_read_reg(client, RV8803_FLAG);
         if (flags < 0)
                 return flags;
  
@@ -488,12 +554,7 @@ static int rv8803_probe(struct i2c_client *client,
                 return PTR_ERR(rv8803->rtc);
         }
  
-       try = 0;
-       do {
-               err = i2c_smbus_write_byte_data(rv8803->client, RV8803_EXT,
-                                               RV8803_EXT_WADA);
-               try++;
-       } while ((err == -ENXIO) && (try < 3));
+       err = rv8803_write_reg(rv8803->client, RV8803_EXT, RV8803_EXT_WADA);
         if (err)
                 return err;
  
diff --git a/drivers/rtc/rtc-rx8010.c b/drivers/rtc/rtc-rx8010.c

index 772d221ec2d9f16fc879787ff2729c3c731a416b..7163b91bb773bd3a8ef0a3be75d9936ffc116f9b 100644 (file)
--- a/drivers/rtc/rtc-rx8010.c
+++ b/drivers/rtc/rtc-rx8010.c
@@ -272,15 +272,9 @@ static int rx8010_read_alarm(struct device *dev, struct rtc_wkalrm *t)
         t->time.tm_min = bcd2bin(alarmvals[0] & 0x7f);
         t->time.tm_hour = bcd2bin(alarmvals[1] & 0x3f);
  
-       if (alarmvals[2] & RX8010_ALARM_AE)
-               t->time.tm_mday = -1;
-       else
+       if (!(alarmvals[2] & RX8010_ALARM_AE))
                 t->time.tm_mday = bcd2bin(alarmvals[2] & 0x7f);
  
-       t->time.tm_wday = -1;
-       t->time.tm_mon = -1;
-       t->time.tm_year = -1;
-
         t->enabled = !!(rx8010->ctrlreg & RX8010_CTRL_AIE);
         t->pending = (flagreg & RX8010_FLAG_AF) && t->enabled;
  
diff --git a/drivers/rtc/rtc-rx8025.c b/drivers/rtc/rtc-rx8025.c

index 9f105efbc5464815717f1a9bf622e947cabebbe2..2b85cc7a24e752c01d1cc16c60d738f7fbfc5983 100644 (file)
--- a/drivers/rtc/rtc-rx8025.c
+++ b/drivers/rtc/rtc-rx8025.c
@@ -319,11 +319,6 @@ static int rx8025_read_alarm(struct device *dev, struct rtc_wkalrm *t)
                 t->time.tm_hour = bcd2bin(ald[1] & 0x1f) % 12
                         + (ald[1] & 0x20 ? 12 : 0);
  
-       t->time.tm_wday = -1;
-       t->time.tm_mday = -1;
-       t->time.tm_mon = -1;
-       t->time.tm_year = -1;
-
         dev_dbg(dev, "%s: date: %ds %dm %dh %dmd %dm %dy\n",
                 __func__,
                 t->time.tm_sec, t->time.tm_min, t->time.tm_hour,
diff --git a/drivers/rtc/rtc-s35390a.c b/drivers/rtc/rtc-s35390a.c

index f40afdd0e5f5939f418b6527521b7c381ffaa286..5dab4665ca3bd2c488fd4548f15fce3945bb6fad 100644 (file)
--- a/drivers/rtc/rtc-s35390a.c
+++ b/drivers/rtc/rtc-s35390a.c
@@ -15,6 +15,7 @@
  #include <linux/bitrev.h>
  #include <linux/bcd.h>
  #include <linux/slab.h>
+#include <linux/delay.h>
  
  #define S35390A_CMD_STATUS1    0
  #define S35390A_CMD_STATUS2    1
@@ -34,10 +35,14 @@
  #define S35390A_ALRM_BYTE_HOURS        1
  #define S35390A_ALRM_BYTE_MINS 2
  
+/* flags for STATUS1 */
  #define S35390A_FLAG_POC       0x01
  #define S35390A_FLAG_BLD       0x02
+#define S35390A_FLAG_INT2      0x04
  #define S35390A_FLAG_24H       0x40
  #define S35390A_FLAG_RESET     0x80
+
+/* flag for STATUS2 */
  #define S35390A_FLAG_TEST      0x01
  
  #define S35390A_INT2_MODE_MASK         0xF0
@@ -94,19 +99,63 @@ static int s35390a_get_reg(struct s35390a *s35390a, int reg, char *buf, int len)
         return 0;
  }
  
-static int s35390a_reset(struct s35390a *s35390a)
+/*
+ * Returns <0 on error, 0 if rtc is setup fine and 1 if the chip was reset.
+ * To keep the information if an irq is pending, pass the value read from
+ * STATUS1 to the caller.
+ */
+static int s35390a_reset(struct s35390a *s35390a, char *status1)
  {
-       char buf[1];
-
-       if (s35390a_get_reg(s35390a, S35390A_CMD_STATUS1, buf, sizeof(buf)) < 0)
-               return -EIO;
-
-       if (!(buf[0] & (S35390A_FLAG_POC | S35390A_FLAG_BLD)))
+       char buf;
+       int ret;
+       unsigned initcount = 0;
+
+       ret = s35390a_get_reg(s35390a, S35390A_CMD_STATUS1, status1, 1);
+       if (ret < 0)
+               return ret;
+
+       if (*status1 & S35390A_FLAG_POC)
+               /*
+                * Do not communicate for 0.5 seconds since the power-on
+                * detection circuit is in operation.
+                */
+               msleep(500);
+       else if (!(*status1 & S35390A_FLAG_BLD))
+               /*
+                * If both POC and BLD are unset everything is fine.
+                */
                 return 0;
  
-       buf[0] |= (S35390A_FLAG_RESET | S35390A_FLAG_24H);
-       buf[0] &= 0xf0;
-       return s35390a_set_reg(s35390a, S35390A_CMD_STATUS1, buf, sizeof(buf));
+       /*
+        * At least one of POC and BLD are set, so reinitialise chip. Keeping
+        * this information in the hardware to know later that the time isn't
+        * valid is unfortunately not possible because POC and BLD are cleared
+        * on read. So the reset is best done now.
+        *
+        * The 24H bit is kept over reset, so set it already here.
+        */
+initialize:
+       *status1 = S35390A_FLAG_24H;
+       buf = S35390A_FLAG_RESET | S35390A_FLAG_24H;
+       ret = s35390a_set_reg(s35390a, S35390A_CMD_STATUS1, &buf, 1);
+
+       if (ret < 0)
+               return ret;
+
+       ret = s35390a_get_reg(s35390a, S35390A_CMD_STATUS1, &buf, 1);
+       if (ret < 0)
+               return ret;
+
+       if (buf & (S35390A_FLAG_POC | S35390A_FLAG_BLD)) {
+               /* Try up to five times to reset the chip */
+               if (initcount < 5) {
+                       ++initcount;
+                       goto initialize;
+               } else
+                       return -EIO;
+       }
+
+       return 1;
  }
  
  static int s35390a_disable_test_mode(struct s35390a *s35390a)
@@ -217,12 +266,12 @@ static int s35390a_set_alarm(struct i2c_client *client, struct rtc_wkalrm *alm)
                 alm->time.tm_min, alm->time.tm_hour, alm->time.tm_mday,
                 alm->time.tm_mon, alm->time.tm_year, alm->time.tm_wday);
  
-       /* disable interrupt */
+       /* disable interrupt (which deasserts the irq line) */
         err = s35390a_set_reg(s35390a, S35390A_CMD_STATUS2, &sts, sizeof(sts));
         if (err < 0)
                 return err;
  
-       /* clear pending interrupt, if any */
+       /* clear pending interrupt (in STATUS1 only), if any */
         err = s35390a_get_reg(s35390a, S35390A_CMD_STATUS1, &sts, sizeof(sts));
         if (err < 0)
                 return err;
@@ -242,6 +291,8 @@ static int s35390a_set_alarm(struct i2c_client *client, struct rtc_wkalrm *alm)
  
         if (alm->time.tm_wday != -1)
                 buf[S35390A_ALRM_BYTE_WDAY] = bin2bcd(alm->time.tm_wday) | 0x80;
+       else
+               buf[S35390A_ALRM_BYTE_WDAY] = 0;
  
         buf[S35390A_ALRM_BYTE_HOURS] = s35390a_hr2reg(s35390a,
                         alm->time.tm_hour) | 0x80;
@@ -269,23 +320,43 @@ static int s35390a_read_alarm(struct i2c_client *client, struct rtc_wkalrm *alm)
         if (err < 0)
                 return err;
  
-       if (bitrev8(sts) != S35390A_INT2_MODE_ALARM)
-               return -EINVAL;
+       if ((bitrev8(sts) & S35390A_INT2_MODE_MASK) != S35390A_INT2_MODE_ALARM) {
+               /*
+                * When the alarm isn't enabled, the register to configure
+                * the alarm time isn't accessible.
+                */
+               alm->enabled = 0;
+               return 0;
+       } else {
+               alm->enabled = 1;
+       }
  
         err = s35390a_get_reg(s35390a, S35390A_CMD_INT2_REG1, buf, sizeof(buf));
         if (err < 0)
                 return err;
  
         /* This chip returns the bits of each byte in reverse order */
-       for (i = 0; i < 3; ++i) {
+       for (i = 0; i < 3; ++i)
                 buf[i] = bitrev8(buf[i]);
-               buf[i] &= ~0x80;
-       }
  
-       alm->time.tm_wday = bcd2bin(buf[S35390A_ALRM_BYTE_WDAY]);
-       alm->time.tm_hour = s35390a_reg2hr(s35390a,
-                                               buf[S35390A_ALRM_BYTE_HOURS]);
-       alm->time.tm_min = bcd2bin(buf[S35390A_ALRM_BYTE_MINS]);
+       /*
+        * B0 of the three matching registers is an enable flag. Iff it is set
+        * the configured value is used for matching.
+        */
+       if (buf[S35390A_ALRM_BYTE_WDAY] & 0x80)
+               alm->time.tm_wday =
+                       bcd2bin(buf[S35390A_ALRM_BYTE_WDAY] & ~0x80);
+
+       if (buf[S35390A_ALRM_BYTE_HOURS] & 0x80)
+               alm->time.tm_hour =
+                       s35390a_reg2hr(s35390a,
+                                      buf[S35390A_ALRM_BYTE_HOURS] & ~0x80);
+
+       if (buf[S35390A_ALRM_BYTE_MINS] & 0x80)
+               alm->time.tm_min = bcd2bin(buf[S35390A_ALRM_BYTE_MINS] & ~0x80);
+
+       /* alarm triggers always at s=0 */
+       alm->time.tm_sec = 0;
  
         dev_dbg(&client->dev, "%s: alm is mins=%d, hours=%d, wday=%d\n",
                         __func__, alm->time.tm_min, alm->time.tm_hour,
@@ -327,11 +398,11 @@ static struct i2c_driver s35390a_driver;
  static int s35390a_probe(struct i2c_client *client,
                          const struct i2c_device_id *id)
  {
-       int err;
+       int err, err_reset;
         unsigned int i;
         struct s35390a *s35390a;
         struct rtc_time tm;
-       char buf[1];
+       char buf, status1;
  
         if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
                 err = -ENODEV;
@@ -360,29 +431,35 @@ static int s35390a_probe(struct i2c_client *client,
                 }
         }
  
-       err = s35390a_reset(s35390a);
-       if (err < 0) {
+       err_reset = s35390a_reset(s35390a, &status1);
+       if (err_reset < 0) {
+               err = err_reset;
                 dev_err(&client->dev, "error resetting chip\n");
                 goto exit_dummy;
         }
  
-       err = s35390a_disable_test_mode(s35390a);
-       if (err < 0) {
-               dev_err(&client->dev, "error disabling test mode\n");
-               goto exit_dummy;
-       }
-
-       err = s35390a_get_reg(s35390a, S35390A_CMD_STATUS1, buf, sizeof(buf));
-       if (err < 0) {
-               dev_err(&client->dev, "error checking 12/24 hour mode\n");
-               goto exit_dummy;
-       }
-       if (buf[0] & S35390A_FLAG_24H)
+       if (status1 & S35390A_FLAG_24H)
                 s35390a->twentyfourhour = 1;
         else
                 s35390a->twentyfourhour = 0;
  
-       if (s35390a_get_datetime(client, &tm) < 0)
+       if (status1 & S35390A_FLAG_INT2) {
+               /* disable alarm (and maybe test mode) */
+               buf = 0;
+               err = s35390a_set_reg(s35390a, S35390A_CMD_STATUS2, &buf, 1);
+               if (err < 0) {
+                       dev_err(&client->dev, "error disabling alarm");
+                       goto exit_dummy;
+               }
+       } else {
+               err = s35390a_disable_test_mode(s35390a);
+               if (err < 0) {
+                       dev_err(&client->dev, "error disabling test mode\n");
+                       goto exit_dummy;
+               }
+       }
+
+       if (err_reset > 0 || s35390a_get_datetime(client, &tm) < 0)
                 dev_warn(&client->dev, "clock needs to be set\n");
  
         device_set_wakeup_capable(&client->dev, 1);
@@ -395,6 +472,10 @@ static int s35390a_probe(struct i2c_client *client,
                 err = PTR_ERR(s35390a->rtc);
                 goto exit_dummy;
         }
+
+       if (status1 & S35390A_FLAG_INT2)
+               rtc_update_irq(s35390a->rtc, 1, RTC_AF);
+
         return 0;
  
  exit_dummy:
diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c

index d01ad7e8078ed6e44c61fbaa8e4875a96d8cce25..d44fb34df8fe894b41f1a8c107e699f752ac5a00 100644 (file)
--- a/drivers/rtc/rtc-s3c.c
+++ b/drivers/rtc/rtc-s3c.c
@@ -149,12 +149,14 @@ static int s3c_rtc_setfreq(struct s3c_rtc *info, int freq)
         if (!is_power_of_2(freq))
                 return -EINVAL;
  
+       s3c_rtc_enable_clk(info);
         spin_lock_irq(&info->pie_lock);
  
         if (info->data->set_freq)
                 info->data->set_freq(info, freq);
  
         spin_unlock_irq(&info->pie_lock);
+       s3c_rtc_disable_clk(info);
  
         return 0;
  }
@@ -264,35 +266,23 @@ static int s3c_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm)
         /* decode the alarm enable field */
         if (alm_en & S3C2410_RTCALM_SECEN)
                 alm_tm->tm_sec = bcd2bin(alm_tm->tm_sec);
-       else
-               alm_tm->tm_sec = -1;
  
         if (alm_en & S3C2410_RTCALM_MINEN)
                 alm_tm->tm_min = bcd2bin(alm_tm->tm_min);
-       else
-               alm_tm->tm_min = -1;
  
         if (alm_en & S3C2410_RTCALM_HOUREN)
                 alm_tm->tm_hour = bcd2bin(alm_tm->tm_hour);
-       else
-               alm_tm->tm_hour = -1;
  
         if (alm_en & S3C2410_RTCALM_DAYEN)
                 alm_tm->tm_mday = bcd2bin(alm_tm->tm_mday);
-       else
-               alm_tm->tm_mday = -1;
  
         if (alm_en & S3C2410_RTCALM_MONEN) {
                 alm_tm->tm_mon = bcd2bin(alm_tm->tm_mon);
                 alm_tm->tm_mon -= 1;
-       } else {
-               alm_tm->tm_mon = -1;
         }
  
         if (alm_en & S3C2410_RTCALM_YEAREN)
                 alm_tm->tm_year = bcd2bin(alm_tm->tm_year);
-       else
-               alm_tm->tm_year = -1;
  
         return 0;
  }
@@ -577,8 +567,6 @@ static int s3c_rtc_probe(struct platform_device *pdev)
  
         s3c_rtc_setfreq(info, 1);
  
-       s3c_rtc_disable_clk(info);
-
         return 0;
  
   err_nortc:
diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c

index a45845a571e51a693d79143333d84f28bce28455..17b6235d67a588e9dff65f1c4689c923d939d8ff 100644 (file)
--- a/drivers/rtc/rtc-sh.c
+++ b/drivers/rtc/rtc-sh.c
@@ -481,7 +481,6 @@ static int sh_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
         tm->tm_mon      = sh_rtc_read_alarm_value(rtc, RMONAR);
         if (tm->tm_mon > 0)
                 tm->tm_mon -= 1; /* RTC is 1-12, tm_mon is 0-11 */
-       tm->tm_year     = 0xffff;
  
         wkalrm->enabled = (readb(rtc->regbase + RCR1) & RCR1_AIE) ? 1 : 0;
  
@@ -500,52 +499,13 @@ static inline void sh_rtc_write_alarm_value(struct sh_rtc *rtc,
                 writeb(bin2bcd(value) | AR_ENB,  rtc->regbase + reg_off);
  }
  
-static int sh_rtc_check_alarm(struct rtc_time *tm)
-{
-       /*
-        * The original rtc says anything > 0xc0 is "don't care" or "match
-        * all" - most users use 0xff but rtc-dev uses -1 for the same thing.
-        * The original rtc doesn't support years - some things use -1 and
-        * some 0xffff. We use -1 to make out tests easier.
-        */
-       if (tm->tm_year == 0xffff)
-               tm->tm_year = -1;
-       if (tm->tm_mon >= 0xff)
-               tm->tm_mon = -1;
-       if (tm->tm_mday >= 0xff)
-               tm->tm_mday = -1;
-       if (tm->tm_wday >= 0xff)
-               tm->tm_wday = -1;
-       if (tm->tm_hour >= 0xff)
-               tm->tm_hour = -1;
-       if (tm->tm_min >= 0xff)
-               tm->tm_min = -1;
-       if (tm->tm_sec >= 0xff)
-               tm->tm_sec = -1;
-
-       if (tm->tm_year > 9999 ||
-               tm->tm_mon >= 12 ||
-               tm->tm_mday == 0 || tm->tm_mday >= 32 ||
-               tm->tm_wday >= 7 ||
-               tm->tm_hour >= 24 ||
-               tm->tm_min >= 60 ||
-               tm->tm_sec >= 60)
-               return -EINVAL;
-
-       return 0;
-}
-
  static int sh_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
  {
         struct platform_device *pdev = to_platform_device(dev);
         struct sh_rtc *rtc = platform_get_drvdata(pdev);
         unsigned int rcr1;
         struct rtc_time *tm = &wkalrm->time;
-       int mon, err;
-
-       err = sh_rtc_check_alarm(tm);
-       if (unlikely(err < 0))
-               return err;
+       int mon;
  
         spin_lock_irq(&rtc->lock);
  
diff --git a/drivers/rtc/rtc-tegra.c b/drivers/rtc/rtc-tegra.c

index 60232bd366ef1499fabaf6d23c8f7538961dcf8e..15ac597d54da20515847ced69d342ae3051f255e 100644 (file)
--- a/drivers/rtc/rtc-tegra.c
+++ b/drivers/rtc/rtc-tegra.c
@@ -179,12 +179,6 @@ static int tegra_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm)
         if (sec == 0) {
                 /* alarm is disabled. */
                 alarm->enabled = 0;
-               alarm->time.tm_mon = -1;
-               alarm->time.tm_mday = -1;
-               alarm->time.tm_year = -1;
-               alarm->time.tm_hour = -1;
-               alarm->time.tm_min = -1;
-               alarm->time.tm_sec = -1;
         } else {
                 /* alarm is enabled. */
                 alarm->enabled = 1;
diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c

index 7a0436329d6ca08e7f43ed7b9c20bdcf5fd6b137..1f3117b5a83cf32ddfc9bfc1eceb11684ba0cae3 100644 (file)
--- a/drivers/rtc/rtc-v3020.c
+++ b/drivers/rtc/rtc-v3020.c
@@ -25,7 +25,7 @@
  #include <linux/rtc.h>
  #include <linux/types.h>
  #include <linux/bcd.h>
-#include <linux/rtc-v3020.h>
+#include <linux/platform_data/rtc-v3020.h>
  #include <linux/delay.h>
  #include <linux/gpio.h>
  #include <linux/slab.h>
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig

index 1918f5483b23debd923aa5412e9797a2e6bbb46d..7d1b4317eccc1ddea1b380db3900052b731377f7 100644 (file)
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -838,6 +838,23 @@ config SCSI_IBMVSCSI
           To compile this driver as a module, choose M here: the
           module will be called ibmvscsi.
  
+config SCSI_IBMVSCSIS
+       tristate "IBM Virtual SCSI Server support"
+       depends on PPC_PSERIES && TARGET_CORE && SCSI && PCI
+       help
+         This is the IBM POWER Virtual SCSI Target Server.
+         This driver uses the SRP protocol for communication between servers
+         guest and/or the host that run on the same server.
+         More information on VSCSI protocol can be found at www.power.org
+
+         The userspace configuration needed to initialize the driver can be
+         found here:
+
+         https://github.com/powervm/ibmvscsis/wiki/Configuration
+
+         To compile this driver as a module, choose M here: the
+         module will be called ibmvscsis.
+
  config SCSI_IBMVFC
         tristate "IBM Virtual FC support"
         depends on PPC_PSERIES && SCSI
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile

index 862ab4efad61e90de24c506675801ec8d16ecff5..d5397987e731b7affcc507ece87f0360bb70327b 100644 (file)
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -128,6 +128,7 @@ obj-$(CONFIG_SCSI_SNI_53C710)       += 53c700.o sni_53c710.o
  obj-$(CONFIG_SCSI_NSP32)       += nsp32.o
  obj-$(CONFIG_SCSI_IPR)         += ipr.o
  obj-$(CONFIG_SCSI_IBMVSCSI)    += ibmvscsi/
+obj-$(CONFIG_SCSI_IBMVSCSIS)   += ibmvscsi_tgt/
  obj-$(CONFIG_SCSI_IBMVFC)      += ibmvscsi/
  obj-$(CONFIG_SCSI_HPTIOP)      += hptiop.o
  obj-$(CONFIG_SCSI_STEX)                += stex.o
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h

index 8fae03215a85d2d26920146617cbfc68c8a845ed..5c70a52ad3466cdb31a8207ac9b664285ad797c6 100644 (file)
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -26,7 +26,7 @@
  
  #include <linux/list.h>
  #include <linux/types.h>
-#include "viosrp.h"
+#include <scsi/viosrp.h>
  
  #define IBMVFC_NAME    "ibmvfc"
  #define IBMVFC_DRIVER_VERSION          "1.0.11"
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.h b/drivers/scsi/ibmvscsi/ibmvscsi.h

index 1067367395cd4fa216f160cf6578195c42e7cbd0..e0f6c3aeb4eef35aa390afaa5d7cd6174d8f7a48 100644 (file)
--- a/drivers/scsi/ibmvscsi/ibmvscsi.h
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.h
@@ -33,7 +33,7 @@
  #include <linux/list.h>
  #include <linux/completion.h>
  #include <linux/interrupt.h>
-#include "viosrp.h"
+#include <scsi/viosrp.h>
  
  struct scsi_cmnd;
  struct Scsi_Host;
diff --git a/drivers/scsi/ibmvscsi/viosrp.h b/drivers/scsi/ibmvscsi/viosrp.h

deleted file mode 100644 (file)

index c1ab8a4..0000000
--- a/drivers/scsi/ibmvscsi/viosrp.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/*****************************************************************************/
-/* srp.h -- SCSI RDMA Protocol definitions                                   */
-/*                                                                           */
-/* Written By: Colin Devilbis, IBM Corporation                               */
-/*                                                                           */
-/* Copyright (C) 2003 IBM Corporation                                        */
-/*                                                                           */
-/* This program is free software; you can redistribute it and/or modify      */
-/* it under the terms of the GNU General Public License as published by      */
-/* the Free Software Foundation; either version 2 of the License, or         */
-/* (at your option) any later version.                                       */
-/*                                                                           */
-/* This program is distributed in the hope that it will be useful,           */
-/* but WITHOUT ANY WARRANTY; without even the implied warranty of            */
-/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             */
-/* GNU General Public License for more details.                              */
-/*                                                                           */
-/* You should have received a copy of the GNU General Public License         */
-/* along with this program; if not, write to the Free Software               */
-/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
-/*                                                                           */
-/*                                                                           */
-/* This file contains structures and definitions for IBM RPA (RS/6000        */
-/* platform architecture) implementation of the SRP (SCSI RDMA Protocol)     */
-/* standard.  SRP is used on IBM iSeries and pSeries platforms to send SCSI  */
-/* commands between logical partitions.                                      */
-/*                                                                           */
-/* SRP Information Units (IUs) are sent on a "Command/Response Queue" (CRQ)  */
-/* between partitions.  The definitions in this file are architected,        */
-/* and cannot be changed without breaking compatibility with other versions  */
-/* of Linux and other operating systems (AIX, OS/400) that talk this protocol*/
-/* between logical partitions                                                */
-/*****************************************************************************/
-#ifndef VIOSRP_H
-#define VIOSRP_H
-#include <scsi/srp.h>
-
-#define SRP_VERSION "16.a"
-#define SRP_MAX_IU_LEN 256
-#define SRP_MAX_LOC_LEN 32
-
-union srp_iu {
-       struct srp_login_req login_req;
-       struct srp_login_rsp login_rsp;
-       struct srp_login_rej login_rej;
-       struct srp_i_logout i_logout;
-       struct srp_t_logout t_logout;
-       struct srp_tsk_mgmt tsk_mgmt;
-       struct srp_cmd cmd;
-       struct srp_rsp rsp;
-       u8 reserved[SRP_MAX_IU_LEN];
-};
-
-enum viosrp_crq_headers {
-       VIOSRP_CRQ_FREE = 0x00,
-       VIOSRP_CRQ_CMD_RSP = 0x80,
-       VIOSRP_CRQ_INIT_RSP = 0xC0,
-       VIOSRP_CRQ_XPORT_EVENT = 0xFF
-};
-
-enum viosrp_crq_init_formats {
-       VIOSRP_CRQ_INIT = 0x01,
-       VIOSRP_CRQ_INIT_COMPLETE = 0x02
-};
-
-enum viosrp_crq_formats {
-       VIOSRP_SRP_FORMAT = 0x01,
-       VIOSRP_MAD_FORMAT = 0x02,
-       VIOSRP_OS400_FORMAT = 0x03,
-       VIOSRP_AIX_FORMAT = 0x04,
-       VIOSRP_LINUX_FORMAT = 0x05,
-       VIOSRP_INLINE_FORMAT = 0x06
-};
-
-enum viosrp_crq_status {
-       VIOSRP_OK = 0x0,
-       VIOSRP_NONRECOVERABLE_ERR = 0x1,
-       VIOSRP_VIOLATES_MAX_XFER = 0x2,
-       VIOSRP_PARTNER_PANIC = 0x3,
-       VIOSRP_DEVICE_BUSY = 0x8,
-       VIOSRP_ADAPTER_FAIL = 0x10,
-       VIOSRP_OK2 = 0x99,
-};
-
-struct viosrp_crq {
-       u8 valid;               /* used by RPA */
-       u8 format;              /* SCSI vs out-of-band */
-       u8 reserved;
-       u8 status;              /* non-scsi failure? (e.g. DMA failure) */
-       __be16 timeout;         /* in seconds */
-       __be16 IU_length;               /* in bytes */
-       __be64 IU_data_ptr;     /* the TCE for transferring data */
-};
-
-/* MADs are Management requests above and beyond the IUs defined in the SRP
- * standard.  
- */
-enum viosrp_mad_types {
-       VIOSRP_EMPTY_IU_TYPE = 0x01,
-       VIOSRP_ERROR_LOG_TYPE = 0x02,
-       VIOSRP_ADAPTER_INFO_TYPE = 0x03,
-       VIOSRP_CAPABILITIES_TYPE = 0x05,
-       VIOSRP_ENABLE_FAST_FAIL = 0x08,
-};
-
-enum viosrp_mad_status {
-       VIOSRP_MAD_SUCCESS = 0x00,
-       VIOSRP_MAD_NOT_SUPPORTED = 0xF1,
-       VIOSRP_MAD_FAILED = 0xF7,
-};
-
-enum viosrp_capability_type {
-       MIGRATION_CAPABILITIES = 0x01,
-       RESERVATION_CAPABILITIES = 0x02,
-};
-
-enum viosrp_capability_support {
-       SERVER_DOES_NOT_SUPPORTS_CAP = 0x0,
-       SERVER_SUPPORTS_CAP = 0x01,
-       SERVER_CAP_DATA = 0x02,
-};
-
-enum viosrp_reserve_type {
-       CLIENT_RESERVE_SCSI_2 = 0x01,
-};
-
-enum viosrp_capability_flag {
-       CLIENT_MIGRATED = 0x01,
-       CLIENT_RECONNECT = 0x02,
-       CAP_LIST_SUPPORTED = 0x04,
-       CAP_LIST_DATA = 0x08,
-};
-
-/* 
- * Common MAD header
- */
-struct mad_common {
-       __be32 type;
-       __be16 status;
-       __be16 length;
-       __be64 tag;
-};
-
-/*
- * All SRP (and MAD) requests normally flow from the
- * client to the server.  There is no way for the server to send
- * an asynchronous message back to the client.  The Empty IU is used
- * to hang out a meaningless request to the server so that it can respond
- * asynchrouously with something like a SCSI AER 
- */
-struct viosrp_empty_iu {
-       struct mad_common common;
-       __be64 buffer;
-       __be32 port;
-};
-
-struct viosrp_error_log {
-       struct mad_common common;
-       __be64 buffer;
-};
-
-struct viosrp_adapter_info {
-       struct mad_common common;
-       __be64 buffer;
-};
-
-struct viosrp_fast_fail {
-       struct mad_common common;
-};
-
-struct viosrp_capabilities {
-       struct mad_common common;
-       __be64 buffer;
-};
-
-struct mad_capability_common {
-       __be32 cap_type;
-       __be16 length;
-       __be16 server_support;
-};
-
-struct mad_reserve_cap {
-       struct mad_capability_common common;
-       __be32 type;
-};
-
-struct mad_migration_cap {
-       struct mad_capability_common common;
-       __be32 ecl;
-};
-
-struct capabilities{
-       __be32 flags;
-       char name[SRP_MAX_LOC_LEN];
-       char loc[SRP_MAX_LOC_LEN];
-       struct mad_migration_cap migration;
-       struct mad_reserve_cap reserve;
-};
-
-union mad_iu {
-       struct viosrp_empty_iu empty_iu;
-       struct viosrp_error_log error_log;
-       struct viosrp_adapter_info adapter_info;
-       struct viosrp_fast_fail fast_fail;
-       struct viosrp_capabilities capabilities;
-};
-
-union viosrp_iu {
-       union srp_iu srp;
-       union mad_iu mad;
-};
-
-struct mad_adapter_info_data {
-       char srp_version[8];
-       char partition_name[96];
-       __be32 partition_number;
-#define SRP_MAD_VERSION_1 1
-       __be32 mad_version;
-#define SRP_MAD_OS_LINUX 2
-#define SRP_MAD_OS_AIX 3
-       __be32 os_type;
-       __be32 port_max_txu[8]; /* per-port maximum transfer */
-};
-
-#endif
diff --git a/drivers/scsi/ibmvscsi_tgt/Makefile b/drivers/scsi/ibmvscsi_tgt/Makefile

new file mode 100644 (file)

index 0000000..0c060ce
--- /dev/null
+++ b/drivers/scsi/ibmvscsi_tgt/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_SCSI_IBMVSCSIS)   += ibmvscsis.o
+
+ibmvscsis-y := libsrp.o ibmvscsi_tgt.o
diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c

new file mode 100644 (file)

index 0000000..b29fef9
--- /dev/null
+++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c
@@ -0,0 +1,4087 @@
+/*******************************************************************************
+ * IBM Virtual SCSI Target Driver
+ * Copyright (C) 2003-2005 Dave Boutcher (boutcher@us.ibm.com) IBM Corp.
+ *                        Santiago Leon (santil@us.ibm.com) IBM Corp.
+ *                        Linda Xie (lxie@us.ibm.com) IBM Corp.
+ *
+ * Copyright (C) 2005-2011 FUJITA Tomonori <tomof@acm.org>
+ * Copyright (C) 2010 Nicholas A. Bellinger <nab@kernel.org>
+ *
+ * Authors: Bryant G. Ly <bryantly@linux.vnet.ibm.com>
+ * Authors: Michael Cyr <mikecyr@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ ****************************************************************************/
+
+#define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/string.h>
+
+#include <target/target_core_base.h>
+#include <target/target_core_fabric.h>
+
+#include <asm/hvcall.h>
+#include <asm/vio.h>
+
+#include <scsi/viosrp.h>
+
+#include "ibmvscsi_tgt.h"
+
+#define IBMVSCSIS_VERSION      "v0.2"
+
+#define        INITIAL_SRP_LIMIT       800
+#define        DEFAULT_MAX_SECTORS     256
+
+static uint max_vdma_size = MAX_H_COPY_RDMA;
+
+static char system_id[SYS_ID_NAME_LEN] = "";
+static char partition_name[PARTITION_NAMELEN] = "UNKNOWN";
+static uint partition_number = -1; /* unsigned: -1 is stored as UINT_MAX */
+
+/* Adapter list and the spinlock that controls access to it */
+static DEFINE_SPINLOCK(ibmvscsis_dev_lock);
+static LIST_HEAD(ibmvscsis_dev_list);
+
+static long ibmvscsis_parse_command(struct scsi_info *vscsi,
+                                   struct viosrp_crq *crq); /* forward declaration, called from ibmvscsis_poll_cmd_q() */
+
+static void ibmvscsis_adapter_idle(struct scsi_info *vscsi); /* forward declaration, called from ibmvscsis_disconnect() */
+
+/*
+ * Record residual-count information for @se_cmd in the SRP response @rsp.
+ *
+ * Nothing is written when the residual count is zero, or when neither the
+ * underflow nor the overflow flag is set on the command.  Otherwise
+ * rsp->flags is overwritten with the single flag that matches the transfer
+ * direction, and the matching residual field receives the count converted
+ * with cpu_to_be32().  The underflow flag takes precedence over the
+ * overflow flag; any other data direction leaves @rsp untouched.
+ */
+static void ibmvscsis_determine_resid(struct se_cmd *se_cmd,
+                                     struct srp_rsp *rsp)
+{
+       const u32 resid = se_cmd->residual_count;
+       bool underflow;
+
+       if (!resid)
+               return;
+
+       underflow = !!(se_cmd->se_cmd_flags & SCF_UNDERFLOW_BIT);
+       if (!underflow && !(se_cmd->se_cmd_flags & SCF_OVERFLOW_BIT))
+               return;
+
+       switch (se_cmd->data_direction) {
+       case DMA_TO_DEVICE:
+               /* residual data from an underflow or overflow write */
+               rsp->flags = underflow ? SRP_RSP_FLAG_DOUNDER :
+                                        SRP_RSP_FLAG_DOOVER;
+               rsp->data_out_res_cnt = cpu_to_be32(resid);
+               break;
+
+       case DMA_FROM_DEVICE:
+               /* residual data from an underflow or overflow read */
+               rsp->flags = underflow ? SRP_RSP_FLAG_DIUNDER :
+                                        SRP_RSP_FLAG_DIOVER;
+               rsp->data_in_res_cnt = cpu_to_be32(resid);
+               break;
+
+       default:
+               break;
+       }
+}
+
+/**
+ * connection_broken() - Determine if the connection to the client is good
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * Builds a PING element in a two-word message and hands it to h_send_crq().
+ * The connection is reported as broken only when that call returns
+ * H_CLOSED; every other return code is treated as "not broken".
+ *
+ * EXECUTION ENVIRONMENT:
+ *      Interrupt or Process environment
+ *
+ * Return: true if h_send_crq() returned H_CLOSED, false otherwise.
+ */
+static bool connection_broken(struct scsi_info *vscsi)
+{
+       u64 msg[2] = { 0, 0 };
+       struct viosrp_crq *ping = (struct viosrp_crq *)&msg;
+       long hrc;
+
+       /* fill in the PING crq in place */
+       ping->valid = VALID_CMD_RESP_EL;
+       ping->format = MESSAGE_IN_CRQ;
+       ping->status = PING;
+
+       hrc = h_send_crq(vscsi->dds.unit_id, cpu_to_be64(msg[MSG_HI]),
+                        cpu_to_be64(msg[MSG_LOW]));
+
+       pr_debug("connection_broken: rc %ld\n", hrc);
+
+       return hrc == H_CLOSED;
+}
+
+/**
+ * ibmvscsis_unregister_command_q() - Helper Function-Unregister Command Queue
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * This function calls h_free_crq and keeps retrying while the hypervisor
+ * answers with one of the busy codes, sleeping between attempts for
+ * roughly the interval named by that code.  The accumulated wait is kept
+ * in "ticks" (approximately milliseconds); once it exceeds 300000 without
+ * h_free_crq succeeding, the function gives up.
+ * Because this function sleeps and h_free_crq can take a long time in
+ * PHYP, the interrupt lock must be released before calling it.
+ * NOTE: the caller must make sure that state and or flags will prevent
+ *      interrupt handler from scheduling work.
+ * NOTE: anyone calling this function may need to set the CRQ_CLOSED flag
+ *      we can't do it here, because we don't have the lock
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Process level
+ *
+ * Return: ADAPT_SUCCESS once h_free_crq returns H_SUCCESS; ERROR if it
+ * returns H_HARDWARE, H_PARAMETER or an unrecognized code, or if the
+ * accumulated wait exceeds the limit.
+ */
+static long ibmvscsis_unregister_command_q(struct scsi_info *vscsi)
+{
+       long qrc;
+       long rc = ADAPT_SUCCESS;
+       int ticks = 0;
+
+       do {
+               qrc = h_free_crq(vscsi->dds.unit_id);
+               switch (qrc) {
+               case H_SUCCESS:
+                       break;
+
+               case H_HARDWARE:
+               case H_PARAMETER:
+                       dev_err(&vscsi->dev, "unregister_command_q: error from h_free_crq %ld\n",
+                               qrc);
+                       rc = ERROR;
+                       break;
+
+               case H_BUSY:
+               case H_LONG_BUSY_ORDER_1_MSEC:
+                       /* msleep not good for small values */
+                       usleep_range(1000, 2000);
+                       ticks += 1;
+                       break;
+               case H_LONG_BUSY_ORDER_10_MSEC:
+                       usleep_range(10000, 20000);
+                       ticks += 10;
+                       break;
+               case H_LONG_BUSY_ORDER_100_MSEC:
+                       msleep(100);
+                       ticks += 100;
+                       break;
+               case H_LONG_BUSY_ORDER_1_SEC:
+                       ssleep(1);
+                       ticks += 1000;
+                       break;
+               case H_LONG_BUSY_ORDER_10_SEC:
+                       ssleep(10);
+                       ticks += 10000;
+                       break;
+               case H_LONG_BUSY_ORDER_100_SEC:
+                       ssleep(100);
+                       ticks += 100000;
+                       break;
+               default:
+                       dev_err(&vscsi->dev, "unregister_command_q: unknown error %ld from h_free_crq\n",
+                               qrc);
+                       rc = ERROR;
+                       break;
+               }
+
+               /*
+                * don't wait more than 300 seconds
+                * ticks are in milliseconds more or less
+                */
+               if (ticks > 300000 && qrc != H_SUCCESS) {
+                       rc = ERROR;
+                       dev_err(&vscsi->dev, "Excessive wait for h_free_crq\n");
+               }
+       } while (qrc != H_SUCCESS && rc == ADAPT_SUCCESS); /* retry only while busy and no error recorded */
+
+       pr_debug("Freeing CRQ: phyp rc %ld, rc %ld\n", qrc, rc);
+
+       return rc;
+}
+
+/**
+ * ibmvscsis_delete_client_info() - Helper function to Delete Client Info
+ * @vscsi:     Pointer to our adapter structure
+ * @client_closed:     True if client closed its queue
+ *
+ * Drops the information kept about the client when the client goes away.
+ * client_cap is always reset; client_data.os_type is reset only when
+ * @client_closed is true.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt or Process
+ */
+static void ibmvscsis_delete_client_info(struct scsi_info *vscsi,
+                                        bool client_closed)
+{
+       vscsi->client_cap = 0;
+
+       /*
+        * When we are the ones closing the queue, keep the remaining
+        * client data: some clients don't resend the host handshake when
+        * they get a transport event.
+        */
+       if (!client_closed)
+               return;
+
+       vscsi->client_data.os_type = 0;
+}
+
+/**
+ * ibmvscsis_free_command_q() - Free Command Queue
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * Does nothing if the CRQ_CLOSED flag is already set.  Otherwise this
+ * function disables the VIO interrupt, calls unregister_command_q with
+ * the interrupt lock dropped and, if there is no error, zeroes the
+ * command queue memory, resets the queue index, sets CRQ_CLOSED and
+ * deletes the client information.
+ *
+ * PHYP did not meet the PAPR architecture so that we must give up the
+ * lock. This causes a timing hole regarding state change.  To close the
+ * hole this routine does accounting on any change that occurred during
+ * the time the lock is not held: a changed new_state is saved in
+ * phyp_acr_state and newly set flag bits are saved in phyp_acr_flags.
+ * NOTE: must give up and then acquire the interrupt lock, the caller must
+ *      make sure that state and or flags will prevent interrupt handler from
+ *      scheduling work.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Process level, interrupt lock is held
+ *
+ * Return: ADAPT_SUCCESS if the queue was already closed or was
+ * unregistered successfully, otherwise the value returned by
+ * ibmvscsis_unregister_command_q().
+ */
+static long ibmvscsis_free_command_q(struct scsi_info *vscsi)
+{
+       int bytes;
+       u32 flags_under_lock;
+       u16 state_under_lock;
+       long rc = ADAPT_SUCCESS;
+
+       if (!(vscsi->flags & CRQ_CLOSED)) {
+               vio_disable_interrupts(vscsi->dma_dev);
+
+               state_under_lock = vscsi->new_state; /* snapshot taken before the lock is dropped */
+               flags_under_lock = vscsi->flags;
+               vscsi->phyp_acr_state = 0;
+               vscsi->phyp_acr_flags = 0;
+
+               spin_unlock_bh(&vscsi->intr_lock);
+               rc = ibmvscsis_unregister_command_q(vscsi);
+               spin_lock_bh(&vscsi->intr_lock);
+
+               if (state_under_lock != vscsi->new_state)
+                       vscsi->phyp_acr_state = vscsi->new_state;
+
+               vscsi->phyp_acr_flags = ((~flags_under_lock) & vscsi->flags); /* bits set now that were clear in the snapshot */
+
+               if (rc == ADAPT_SUCCESS) {
+                       bytes = vscsi->cmd_q.size * PAGE_SIZE;
+                       memset(vscsi->cmd_q.base_addr, 0, bytes);
+                       vscsi->cmd_q.index = 0;
+                       vscsi->flags |= CRQ_CLOSED;
+
+                       ibmvscsis_delete_client_info(vscsi, false);
+               }
+
+               pr_debug("free_command_q: flags 0x%x, state 0x%hx, acr_flags 0x%x, acr_state 0x%hx\n",
+                        vscsi->flags, vscsi->state, vscsi->phyp_acr_flags,
+                        vscsi->phyp_acr_state);
+       }
+       return rc;
+}
+
+/**
+ * ibmvscsis_cmd_q_dequeue() - Get valid Command element
+ * @mask:      Mask to use in case index wraps
+ * @current_index:     Current index into command queue
+ * @base_addr: Pointer to start of command queue
+ *
+ * Looks at the element at *@current_index.  If its valid byte is non-zero
+ * the index is advanced by one (wrapped with @mask), a DMA read barrier is
+ * issued and the element is returned.  If the valid byte is zero the index
+ * is left alone.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt environment, interrupt lock held
+ *
+ * Return: pointer to the valid command element, or NULL if the command
+ * queue is empty.
+ */
+static struct viosrp_crq *ibmvscsis_cmd_q_dequeue(uint mask,
+                                                 uint *current_index,
+                                                 struct viosrp_crq *base_addr)
+{
+       struct viosrp_crq *elem = &base_addr[*current_index];
+
+       /* nothing queued at the current slot */
+       if (!elem->valid)
+               return NULL;
+
+       *current_index = (*current_index + 1) & mask;
+       dma_rmb();
+
+       return elem;
+}
+
+/**
+ * ibmvscsis_send_init_message() -  send initialize message to the client
+ * @vscsi:     Pointer to our adapter structure
+ * @format:    Which Init Message format to send
+ *
+ * Builds a CRQ element marked VALID_INIT_MSG with the requested format in
+ * an otherwise zeroed two-word message and passes it to h_send_crq().
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt environment interrupt lock held
+ *
+ * Return: the return code of h_send_crq(), unmodified.
+ */
+static long ibmvscsis_send_init_message(struct scsi_info *vscsi, u8 format)
+{
+       u64 msg[2] = { 0, 0 };
+       struct viosrp_crq *init = (struct viosrp_crq *)&msg;
+
+       init->valid = VALID_INIT_MSG;
+       init->format = format;
+
+       return h_send_crq(vscsi->dds.unit_id, cpu_to_be64(msg[MSG_HI]),
+                         cpu_to_be64(msg[MSG_LOW]));
+}
+
+/**
+ * ibmvscsis_check_init_msg() - Check init message valid
+ * @vscsi:     Pointer to our adapter structure
+ * @format:    Pointer to return format of Init Message, if any.
+ *             Set to UNUSED_FORMAT if no Init Message in queue.
+ *             On error it holds the format of the unexpected element.
+ *
+ * Checks if an initialize message was queued by the initiator
+ * after the queue was created and before the interrupt was enabled.
+ * Every element dequeued here is invalidated (its valid byte is set to
+ * INVALIDATE_CMD_RESP_EL).
+ *
+ * NOTE(review): each invalidating store is followed by dma_rmb(), a read
+ * barrier; confirm whether a write barrier was intended.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Process level only, interrupt lock held
+ *
+ * Return: ADAPT_SUCCESS if the queue was empty or held exactly one
+ * INIT_MSG; ERROR if the first element was anything else, or if a second
+ * element followed the INIT_MSG.
+ */
+static long ibmvscsis_check_init_msg(struct scsi_info *vscsi, uint *format)
+{
+       struct viosrp_crq *crq;
+       long rc = ADAPT_SUCCESS;
+
+       crq = ibmvscsis_cmd_q_dequeue(vscsi->cmd_q.mask, &vscsi->cmd_q.index,
+                                     vscsi->cmd_q.base_addr);
+       if (!crq) {
+               *format = (uint)UNUSED_FORMAT;
+       } else if (crq->valid == VALID_INIT_MSG && crq->format == INIT_MSG) {
+               *format = (uint)INIT_MSG;
+               crq->valid = INVALIDATE_CMD_RESP_EL;
+               dma_rmb();
+
+               /*
+                * The caller has ensured that no initialize message was
+                * sent after the queue was created, so there should be no
+                * other message on the queue.
+                */
+               crq = ibmvscsis_cmd_q_dequeue(vscsi->cmd_q.mask,
+                                             &vscsi->cmd_q.index,
+                                             vscsi->cmd_q.base_addr);
+               if (crq) {
+                       *format = (uint)(crq->format); /* replaces INIT_MSG with the unexpected element's format */
+                       rc =  ERROR;
+                       crq->valid = INVALIDATE_CMD_RESP_EL;
+                       dma_rmb();
+               }
+       } else {
+               *format = (uint)(crq->format);
+               rc =  ERROR;
+               crq->valid = INVALIDATE_CMD_RESP_EL;
+               dma_rmb();
+       }
+
+       return rc;
+}
+
+/**
+ * ibmvscsis_establish_new_q() - Establish new CRQ queue
+ * @vscsi:     Pointer to our adapter structure
+ * @new_state: New state being established after resetting the queue
+ *
+ * Masks the adapter flags with PRESERVE_FLAG_FIELDS, zeroes the response
+ * queue timer pop count and the debit/credit counters, enables the VIO
+ * interrupt and checks the queue for an init message.  If none was found
+ * and @new_state is WAIT_CONNECTION, an INIT_MSG is sent to the client.
+ * The log messages are tagged "reset_queue" because this helper is called
+ * from ibmvscsis_reset_queue().
+ *
+ * Must be called with interrupt lock held.
+ *
+ * Return: ADAPT_SUCCESS on success.  Otherwise the non-zero code from
+ * vio_enable_interrupts() or ibmvscsis_check_init_msg(), or, for the init
+ * message send, H_PARAMETER/H_HARDWARE as received, or H_HARDWARE (with
+ * the adapter state set to UNDEFINED) for any other unexpected code.
+ */
+static long ibmvscsis_establish_new_q(struct scsi_info *vscsi,  uint new_state)
+{
+       long rc = ADAPT_SUCCESS;
+       uint format;
+
+       vscsi->flags &= PRESERVE_FLAG_FIELDS;
+       vscsi->rsp_q_timer.timer_pops = 0;
+       vscsi->debit = 0;
+       vscsi->credit = 0;
+
+       rc = vio_enable_interrupts(vscsi->dma_dev);
+       if (rc) {
+               pr_warn("reset_queue: failed to enable interrupts, rc %ld\n",
+                       rc);
+               return rc;
+       }
+
+       rc = ibmvscsis_check_init_msg(vscsi, &format);
+       if (rc) {
+               dev_err(&vscsi->dev, "reset_queue: check_init_msg failed, rc %ld\n",
+                       rc);
+               return rc;
+       }
+
+       if (format == UNUSED_FORMAT && new_state == WAIT_CONNECTION) {
+               rc = ibmvscsis_send_init_message(vscsi, INIT_MSG);
+               switch (rc) {
+               case H_SUCCESS:
+               case H_DROPPED:
+               case H_CLOSED:
+                       rc = ADAPT_SUCCESS; /* all three are treated as success here */
+                       break;
+
+               case H_PARAMETER:
+               case H_HARDWARE:
+                       break; /* returned to the caller unchanged */
+
+               default:
+                       vscsi->state = UNDEFINED;
+                       rc = H_HARDWARE;
+                       break;
+               }
+       }
+
+       return rc;
+}
+
+/**
+ * ibmvscsis_reset_queue() - Reset CRQ Queue
+ * @vscsi:     Pointer to our adapter structure
+ * @new_state: New state to establish after resetting the queue
+ *
+ * This function frees the command queue (ibmvscsis_free_command_q), then
+ * registers it again with h_reg_crq and does all of the bookkeeping
+ * (ibmvscsis_establish_new_q) to get us back to where we can communicate.
+ *
+ * Actually, we don't always call h_free_crq.  A problem was discovered
+ * where one partition would close and reopen his queue, which would
+ * cause his partner to get a transport event, which would cause him to
+ * close and reopen his queue, which would cause the original partition
+ * to get a transport event, etc., etc.  To prevent this, we don't
+ * actually close our queue if the client initiated the reset, (i.e.
+ * either we got a transport event or we have detected that the client's
+ * queue is gone).  In that case only the flags, timer pops and
+ * debit/credit counters are reset, the state is set to @new_state and
+ * the interrupt is re-enabled.
+ *
+ * If freeing, registering or re-establishing the queue fails, the adapter
+ * state is set to ERR_DISCONNECTED and RESPONSE_Q_DOWN is set in the
+ * flags.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Process environment, called with interrupt lock held
+ */
+static void ibmvscsis_reset_queue(struct scsi_info *vscsi, uint new_state)
+{
+       int bytes;
+       long rc = ADAPT_SUCCESS;
+
+       pr_debug("reset_queue: flags 0x%x\n", vscsi->flags);
+
+       /* don't reset, the client did it for us */
+       if (vscsi->flags & (CLIENT_FAILED | TRANS_EVENT)) {
+               vscsi->flags &=  PRESERVE_FLAG_FIELDS;
+               vscsi->rsp_q_timer.timer_pops = 0;
+               vscsi->debit = 0;
+               vscsi->credit = 0;
+               vscsi->state = new_state;
+               vio_enable_interrupts(vscsi->dma_dev); /* return value is not checked on this path */
+       } else {
+               rc = ibmvscsis_free_command_q(vscsi);
+               if (rc == ADAPT_SUCCESS) {
+                       vscsi->state = new_state;
+
+                       bytes = vscsi->cmd_q.size * PAGE_SIZE;
+                       rc = h_reg_crq(vscsi->dds.unit_id,
+                                      vscsi->cmd_q.crq_token, bytes);
+                       if (rc == H_CLOSED || rc == H_SUCCESS) {
+                               rc = ibmvscsis_establish_new_q(vscsi,
+                                                              new_state);
+                       }
+
+                       if (rc != ADAPT_SUCCESS) {
+                               pr_debug("reset_queue: reg_crq rc %ld\n", rc);
+
+                               vscsi->state = ERR_DISCONNECTED;
+                               vscsi->flags |=  RESPONSE_Q_DOWN;
+                               ibmvscsis_free_command_q(vscsi);
+                       }
+               } else {
+                       vscsi->state = ERR_DISCONNECTED;
+                       vscsi->flags |= RESPONSE_Q_DOWN;
+               }
+       }
+}
+
+/**
+ * ibmvscsis_free_cmd_resources() - Free command resources
+ * @vscsi:     Pointer to our adapter structure
+ * @cmd:       Command which is no longer in use
+ *
+ * Undoes the per-type accounting for @cmd (debit count for SCSI and task
+ * management commands, the PROCESSING_MAD flag for MADs), detaches its
+ * information unit, puts @cmd back on the adapter's free_cmd list and
+ * releases the information unit with srp_iu_put().  If the active,
+ * schedule and waiting-response lists are then all empty and
+ * WAIT_FOR_IDLE is set, the flag is cleared and wait_idle is completed.
+ *
+ * Must be called with interrupt lock held.
+ */
+static void ibmvscsis_free_cmd_resources(struct scsi_info *vscsi,
+                                        struct ibmvscsis_cmd *cmd)
+{
+       struct iu_entry *iue = cmd->iue;
+
+       switch (cmd->type) {
+       case TASK_MANAGEMENT:
+       case SCSI_CDB:
+               /*
+                * When the queue goes down this value is cleared, so it
+                * cannot be cleared in this general purpose function.
+                */
+               if (vscsi->debit)
+                       vscsi->debit -= 1;
+               break;
+       case ADAPTER_MAD:
+               vscsi->flags &= ~PROCESSING_MAD;
+               break;
+       case UNSET_TYPE:
+               break;
+       default:
+               dev_err(&vscsi->dev, "free_cmd_resources unknown type %d\n",
+                       cmd->type);
+               break;
+       }
+
+       cmd->iue = NULL;
+       list_add_tail(&cmd->list, &vscsi->free_cmd);
+       srp_iu_put(iue);
+
+       if (list_empty(&vscsi->active_q) && list_empty(&vscsi->schedule_q) &&
+           list_empty(&vscsi->waiting_rsp) && (vscsi->flags & WAIT_FOR_IDLE)) {
+               vscsi->flags &= ~WAIT_FOR_IDLE;
+               complete(&vscsi->wait_idle);
+       }
+}
+
+/**
+ * ibmvscsis_disconnect() - Helper function to disconnect
+ * @work:      Pointer to work_struct, gives access to our adapter structure
+ *
+ * An error has occurred or the driver received a Transport event,
+ * and the driver is requesting that the command queue be de-registered
+ * in a safe manner. If there is no outstanding I/O then we can stop the
+ * queue. If we are restarting the queue it will be reflected in the
+ * state of the adapter.
+ *
+ * The requested state is taken from vscsi->new_state, which is read and
+ * then zeroed under the interrupt lock.  Whether it is applied depends on
+ * the current adapter state, as described case by case below.  For the
+ * WAIT_CONNECTION, CONNECTED and SRP_PROCESSING states the function waits
+ * (with the lock dropped) for the active and schedule queues to drain if
+ * either is non-empty, and then calls ibmvscsis_adapter_idle().
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Process environment
+ */
+static void ibmvscsis_disconnect(struct work_struct *work)
+{
+       struct scsi_info *vscsi = container_of(work, struct scsi_info,
+                                              proc_work);
+       u16 new_state;
+       bool wait_idle = false;
+       long rc = ADAPT_SUCCESS;
+
+       spin_lock_bh(&vscsi->intr_lock);
+       new_state = vscsi->new_state;
+       vscsi->new_state = 0; /* the pending request is consumed here */
+
+       pr_debug("disconnect: flags 0x%x, state 0x%hx\n", vscsi->flags,
+                vscsi->state);
+
+       /*
+        * check which state we are in and see if we
+        * should transition to the new state
+        */
+       switch (vscsi->state) {
+       /*  Should never be called while in this state. */
+       case NO_QUEUE:
+       /*
+        * Can never transition from this state;
+        * ignore errors and logout.
+        */
+       case UNCONFIGURING:
+               break;
+
+       /* can transition from this state to UNCONFIGURING */
+       case ERR_DISCONNECT:
+               if (new_state == UNCONFIGURING)
+                       vscsi->state = new_state;
+               break;
+
+       /*
+        * Can transition from this state to unconfiguring
+        * or err disconnect.
+        */
+       case ERR_DISCONNECT_RECONNECT:
+               switch (new_state) {
+               case UNCONFIGURING:
+               case ERR_DISCONNECT:
+                       vscsi->state = new_state;
+                       break;
+
+               case WAIT_IDLE:
+                       break;
+               default:
+                       break;
+               }
+               break;
+
+       /* can transition from this state to UNCONFIGURING */
+       case ERR_DISCONNECTED:
+               if (new_state == UNCONFIGURING)
+                       vscsi->state = new_state;
+               break;
+
+       /*
+        * If this is a transition into an error state,
+        * a client is attempting to establish a connection
+        * and has violated the RPA protocol.
+        * There can be nothing pending on the adapter although
+        * there can be requests in the command queue.
+        */
+       case WAIT_ENABLED:
+       case PART_UP_WAIT_ENAB:
+               switch (new_state) {
+               case ERR_DISCONNECT:
+                       vscsi->flags |= RESPONSE_Q_DOWN;
+                       vscsi->state = new_state;
+                       vscsi->flags &= ~(SCHEDULE_DISCONNECT |
+                                         DISCONNECT_SCHEDULED);
+                       ibmvscsis_free_command_q(vscsi);
+                       break;
+               case ERR_DISCONNECT_RECONNECT:
+                       ibmvscsis_reset_queue(vscsi, WAIT_ENABLED);
+                       break;
+
+               /* should never happen */
+               case WAIT_IDLE:
+                       rc = ERROR; /* recorded locally only; this function returns void */
+                       dev_err(&vscsi->dev, "disconnect: invalid state %d for WAIT_IDLE\n",
+                               vscsi->state);
+                       break;
+               }
+               break;
+
+       case WAIT_IDLE:
+               switch (new_state) {
+               case ERR_DISCONNECT:
+               case ERR_DISCONNECT_RECONNECT:
+                       vscsi->state = new_state;
+                       break;
+               }
+               break;
+
+       /*
+        * Initiator has not done a successful srp login
+        * or has done a successful srp logout ( adapter was not
+        * busy). In the first case there can be responses queued
+        * waiting for space on the initiators response queue (MAD)
+        * The second case the adapter is idle. Assume the worse case,
+        * i.e. the second case.
+        */
+       case WAIT_CONNECTION:
+       case CONNECTED:
+       case SRP_PROCESSING:
+               wait_idle = true;
+               vscsi->state = new_state;
+               break;
+
+       /* can transition from this state to UNCONFIGURING */
+       case UNDEFINED:
+               if (new_state == UNCONFIGURING)
+                       vscsi->state = new_state;
+               break;
+       default:
+               break;
+       }
+
+       if (wait_idle) {
+               pr_debug("disconnect start wait, active %d, sched %d\n", /* the values printed are list_empty() results */
+                        (int)list_empty(&vscsi->active_q),
+                        (int)list_empty(&vscsi->schedule_q));
+               if (!list_empty(&vscsi->active_q) ||
+                   !list_empty(&vscsi->schedule_q)) {
+                       vscsi->flags |= WAIT_FOR_IDLE;
+                       pr_debug("disconnect flags 0x%x\n", vscsi->flags);
+                       /*
+                        * wait_for_completion() cannot be called with the
+                        * interrupt lock held.
+                        */
+                       spin_unlock_bh(&vscsi->intr_lock);
+                       wait_for_completion(&vscsi->wait_idle);
+                       spin_lock_bh(&vscsi->intr_lock);
+               }
+               pr_debug("disconnect stop wait\n");
+
+               ibmvscsis_adapter_idle(vscsi);
+       }
+
+       spin_unlock_bh(&vscsi->intr_lock);
+}
+
+/**
+ * ibmvscsis_post_disconnect() - Schedule the disconnect
+ * @vscsi:     Pointer to our adapter structure
+ * @new_state: State to move to after disconnecting
+ * @flag_bits: Flags to turn on in adapter structure
+ *
+ * Only UNCONFIGURING, ERR_DISCONNECT, ERR_DISCONNECT_RECONNECT and
+ * WAIT_IDLE are accepted as @new_state; anything else is logged and the
+ * function returns without changing the adapter.
+ *
+ * If no disconnect is pending (neither DISCONNECT_SCHEDULED nor
+ * SCHEDULE_DISCONNECT is set), SCHEDULE_DISCONNECT is set, @new_state is
+ * recorded in vscsi->new_state and ibmvscsis_disconnect() is queued on the
+ * adapter's work queue.
+ *
+ * If it's already been scheduled, then see if we need to "upgrade"
+ * the new state (if the one passed in is more "severe" than the
+ * previous one).  The comparison is made against the pending new_state
+ * if there is one, otherwise against the current adapter state.
+ *
+ * PRECONDITION:
+ *     interrupt lock is held
+ */
+static void ibmvscsis_post_disconnect(struct scsi_info *vscsi, uint new_state,
+                                     uint flag_bits)
+{
+       uint state;
+
+       /* check the validity of the new state */
+       switch (new_state) {
+       case UNCONFIGURING:
+       case ERR_DISCONNECT:
+       case ERR_DISCONNECT_RECONNECT:
+       case WAIT_IDLE:
+               break;
+
+       default:
+               dev_err(&vscsi->dev, "post_disconnect: Invalid new state %d\n",
+                       new_state);
+               return;
+       }
+
+       vscsi->flags |= flag_bits;
+
+       pr_debug("post_disconnect: new_state 0x%x, flag_bits 0x%x, vscsi->flags 0x%x, state %hx\n",
+                new_state, flag_bits, vscsi->flags, vscsi->state);
+
+       if (!(vscsi->flags & (DISCONNECT_SCHEDULED | SCHEDULE_DISCONNECT))) {
+               vscsi->flags |= SCHEDULE_DISCONNECT;
+               vscsi->new_state = new_state;
+
+               INIT_WORK(&vscsi->proc_work, ibmvscsis_disconnect);
+               (void)queue_work(vscsi->work_q, &vscsi->proc_work);
+       } else {
+               if (vscsi->new_state)
+                       state = vscsi->new_state;
+               else
+                       state = vscsi->state;
+
+               switch (state) {
+               case NO_QUEUE:
+               case UNCONFIGURING:
+                       break; /* nothing overrides these */
+
+               case ERR_DISCONNECTED:
+               case ERR_DISCONNECT:
+               case UNDEFINED:
+                       if (new_state == UNCONFIGURING)
+                               vscsi->new_state = new_state;
+                       break;
+
+               case ERR_DISCONNECT_RECONNECT:
+                       switch (new_state) {
+                       case UNCONFIGURING:
+                       case ERR_DISCONNECT:
+                               vscsi->new_state = new_state;
+                               break;
+                       default:
+                               break;
+                       }
+                       break;
+
+               case WAIT_ENABLED:
+               case PART_UP_WAIT_ENAB:
+               case WAIT_IDLE:
+               case WAIT_CONNECTION:
+               case CONNECTED:
+               case SRP_PROCESSING:
+                       vscsi->new_state = new_state; /* any valid request replaces the pending one */
+                       break;
+
+               default:
+                       break;
+               }
+       }
+
+       pr_debug("Leaving post_disconnect: flags 0x%x, new_state 0x%x\n",
+                vscsi->flags, vscsi->new_state);
+}
+
+/**
+ * ibmvscsis_trans_event() - Handle a Transport Event
+ * @vscsi:     Pointer to our adapter structure
+ * @crq:       Pointer to CRQ entry containing the Transport Event
+ *
+ * Do the logic to close the I_T nexus.  This function may not
+ * behave to specification.
+ *
+ * For the MIGRATED, PARTNER_FAILED and PARTNER_DEREGISTER formats the
+ * client information is deleted and the adapter state decides what
+ * happens next: CONNECTED and SRP_PROCESSING post a disconnect to
+ * WAIT_IDLE, PART_UP_WAIT_ENAB falls back to WAIT_ENABLED, and the
+ * UNCONFIGURING, ERR_DISCONNECT, ERR_DISCONNECT_RECONNECT and WAIT_IDLE
+ * states only set the RESPONSE_Q_DOWN and TRANS_EVENT flags.  Any other
+ * format is logged and posts a disconnect to ERR_DISCONNECT.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt, interrupt lock held
+ *
+ * Return: the SCHEDULE_DISCONNECT bit of the adapter flags, i.e. non-zero
+ * if that flag is set on exit and zero otherwise.  The ERROR value set
+ * for an invalid format is not returned.
+ */
+static long ibmvscsis_trans_event(struct scsi_info *vscsi,
+                                 struct viosrp_crq *crq)
+{
+       long rc = ADAPT_SUCCESS;
+
+       pr_debug("trans_event: format %d, flags 0x%x, state 0x%hx\n",
+                (int)crq->format, vscsi->flags, vscsi->state);
+
+       switch (crq->format) {
+       case MIGRATED:
+       case PARTNER_FAILED:
+       case PARTNER_DEREGISTER:
+               ibmvscsis_delete_client_info(vscsi, true);
+               break;
+
+       default:
+               rc = ERROR; /* only used to skip the state handling below */
+               dev_err(&vscsi->dev, "trans_event: invalid format %d\n",
+                       (uint)crq->format);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT,
+                                         RESPONSE_Q_DOWN);
+               break;
+       }
+
+       if (rc == ADAPT_SUCCESS) {
+               switch (vscsi->state) {
+               case NO_QUEUE:
+               case ERR_DISCONNECTED:
+               case UNDEFINED:
+                       break;
+
+               case UNCONFIGURING:
+                       vscsi->flags |= (RESPONSE_Q_DOWN | TRANS_EVENT);
+                       break;
+
+               case WAIT_ENABLED:
+                       break;
+
+               case WAIT_CONNECTION:
+                       break;
+
+               case CONNECTED:
+                       ibmvscsis_post_disconnect(vscsi, WAIT_IDLE,
+                                                 (RESPONSE_Q_DOWN |
+                                                  TRANS_EVENT));
+                       break;
+
+               case PART_UP_WAIT_ENAB:
+                       vscsi->state = WAIT_ENABLED;
+                       break;
+
+               case SRP_PROCESSING:
+                       if ((vscsi->debit > 0) ||
+                           !list_empty(&vscsi->schedule_q) ||
+                           !list_empty(&vscsi->waiting_rsp) ||
+                           !list_empty(&vscsi->active_q)) {
+                               pr_debug("debit %d, sched %d, wait %d, active %d\n", /* the list values printed are list_empty() results */
+                                        vscsi->debit,
+                                        (int)list_empty(&vscsi->schedule_q),
+                                        (int)list_empty(&vscsi->waiting_rsp),
+                                        (int)list_empty(&vscsi->active_q));
+                               pr_warn("connection lost with outstanding work\n");
+                       } else {
+                               pr_debug("trans_event: SRP Processing, but no outstanding work\n");
+                       }
+
+                       ibmvscsis_post_disconnect(vscsi, WAIT_IDLE,
+                                                 (RESPONSE_Q_DOWN |
+                                                  TRANS_EVENT));
+                       break;
+
+               case ERR_DISCONNECT:
+               case ERR_DISCONNECT_RECONNECT:
+               case WAIT_IDLE:
+                       vscsi->flags |= (RESPONSE_Q_DOWN | TRANS_EVENT);
+                       break;
+               }
+       }
+
+       rc =  vscsi->flags & SCHEDULE_DISCONNECT; /* replaces whatever rc held above */
+
+       pr_debug("Leaving trans_event: flags 0x%x, state 0x%hx, rc %ld\n",
+                vscsi->flags, vscsi->state, rc);
+
+       return rc;
+}
+
+/**
+ * ibmvscsis_poll_cmd_q() - Poll Command Queue
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * Called to handle command elements that may have arrived while
+ * interrupts were disabled.
+ *
+ * Starting at cmd_q.index, every element marked valid is consumed in
+ * turn.  While no disconnect is scheduled the element is handed to
+ * ibmvscsis_parse_command(); once an error/disconnect is pending only
+ * transport events are still serviced.  When the queue looks empty and
+ * no error is pending, interrupts are re-enabled (at most once per call)
+ * and the current element is checked again before returning.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     intr_lock must be held
+ */
+static void ibmvscsis_poll_cmd_q(struct scsi_info *vscsi)
+{
+       struct viosrp_crq *crq;
+       long rc;
+       bool ack = true;
+       volatile u8 valid;
+
+       pr_debug("poll_cmd_q: flags 0x%x, state 0x%hx, q index %u\n",
+                vscsi->flags, vscsi->state, vscsi->cmd_q.index);
+
+       /* a non-zero rc means a disconnect has already been scheduled */
+       rc = vscsi->flags & SCHEDULE_DISCONNECT;
+       crq = vscsi->cmd_q.base_addr + vscsi->cmd_q.index;
+       valid = crq->valid;
+       dma_rmb();
+
+       while (valid) {
+poll_work:
+               /* advance to the next slot, wrapping via the queue mask */
+               vscsi->cmd_q.index =
+                       (vscsi->cmd_q.index + 1) & vscsi->cmd_q.mask;
+
+               if (!rc) {
+                       rc = ibmvscsis_parse_command(vscsi, crq);
+               } else {
+                       if ((uint)crq->valid == VALID_TRANS_EVENT) {
+                               /*
+                                * must service the transport layer events even
+                                * in an error state, don't break out until all
+                                * the consecutive transport events have been
+                                * processed
+                                */
+                               rc = ibmvscsis_trans_event(vscsi, crq);
+                       } else if (vscsi->flags & TRANS_EVENT) {
+                               /*
+                                * if a transport event has occurred leave
+                                * everything but transport events on the queue
+                                */
+                               pr_debug("poll_cmd_q, ignoring\n");
+
+                               /*
+                                * need to decrement the queue index so we can
+                                * look at the element again
+                                */
+                               if (vscsi->cmd_q.index)
+                                       vscsi->cmd_q.index -= 1;
+                               else
+                                       /*
+                                        * index is at 0 it just wrapped.
+                                        * have it index last element in q
+                                        */
+                                       vscsi->cmd_q.index = vscsi->cmd_q.mask;
+                               break;
+                       }
+               }
+
+               /* mark the element just handled as consumed */
+               crq->valid = INVALIDATE_CMD_RESP_EL;
+
+               crq = vscsi->cmd_q.base_addr + vscsi->cmd_q.index;
+               valid = crq->valid;
+               dma_rmb();
+       }
+
+       if (!rc) {
+               /* re-enable interrupts only once per invocation */
+               if (ack) {
+                       vio_enable_interrupts(vscsi->dma_dev);
+                       ack = false;
+                       pr_debug("poll_cmd_q, reenabling interrupts\n");
+               }
+               /* re-check the current slot; jump back in if it became valid */
+               valid = crq->valid;
+               dma_rmb();
+               if (valid)
+                       goto poll_work;
+       }
+
+       pr_debug("Leaving poll_cmd_q: rc %ld\n", rc);
+}
+
+/**
+ * ibmvscsis_free_cmd_qs() - Free elements in queue
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * Drains the waiting_rsp list: each command still queued there is
+ * unlinked and passed to ibmvscsis_free_cmd_resources().
+ *
+ * PRECONDITION:
+ *     Called with interrupt lock held
+ */
+static void ibmvscsis_free_cmd_qs(struct scsi_info *vscsi)
+{
+       struct list_head *pending = &vscsi->waiting_rsp;
+       struct ibmvscsis_cmd *entry, *tmp;
+
+       pr_debug("free_cmd_qs: waiting_rsp empty %d, timer starter %d\n",
+                (int)list_empty(pending), vscsi->rsp_q_timer.started);
+
+       /* safe iteration: each entry is removed while walking the list */
+       list_for_each_entry_safe(entry, tmp, pending, list) {
+               list_del(&entry->list);
+               ibmvscsis_free_cmd_resources(vscsi, entry);
+       }
+}
+
+/**
+ * ibmvscsis_get_free_cmd() - Get free command from list
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * Pairs an information unit obtained from srp_iu_get() with the first
+ * command element on the free_cmd list.  If no command element is
+ * available the information unit is handed back with srp_iu_put().
+ *
+ * Must be called with interrupt lock held.
+ *
+ * Return: the prepared command, or NULL if either resource is missing.
+ */
+static struct ibmvscsis_cmd *ibmvscsis_get_free_cmd(struct scsi_info *vscsi)
+{
+       struct ibmvscsis_cmd *free_cmd;
+       struct iu_entry *iu_ent;
+
+       iu_ent = srp_iu_get(&vscsi->target);
+       if (!iu_ent)
+               return NULL;
+
+       free_cmd = list_first_entry_or_null(&vscsi->free_cmd,
+                                           struct ibmvscsis_cmd, list);
+       if (!free_cmd) {
+               /* nothing to attach the IU to; give it back */
+               srp_iu_put(iu_ent);
+               return NULL;
+       }
+
+       list_del(&free_cmd->list);
+       free_cmd->iue = iu_ent;
+       free_cmd->type = UNSET_TYPE;
+       memset(&free_cmd->se_cmd, 0, sizeof(free_cmd->se_cmd));
+
+       return free_cmd;
+}
+
+/**
+ * ibmvscsis_adapter_idle() - Helper function to handle idle adapter
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * This function is called when the adapter is idle when the driver
+ * is attempting to clear an error condition.
+ * The adapter is considered busy if any of its cmd queues
+ * are non-empty. This function can be invoked
+ * from the off level disconnect function.
+ *
+ * Depending on the current state the command queue is reset or freed,
+ * the state is advanced, and any disconnect request recorded in
+ * phyp_acr_state/phyp_acr_flags is re-posted before returning.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Process environment called with interrupt lock held
+ */
+static void ibmvscsis_adapter_idle(struct scsi_info *vscsi)
+{
+       bool free_qs = false;
+
+       pr_debug("adapter_idle: flags 0x%x, state 0x%hx\n", vscsi->flags,
+                vscsi->state);
+
+       /* Only need to free qs if we're disconnecting from client */
+       if (vscsi->state != WAIT_CONNECTION || vscsi->flags & TRANS_EVENT)
+               free_qs = true;
+
+       switch (vscsi->state) {
+       case ERR_DISCONNECT_RECONNECT:
+               ibmvscsis_reset_queue(vscsi, WAIT_CONNECTION);
+               pr_debug("adapter_idle, disc_rec: flags 0x%x\n", vscsi->flags);
+               break;
+
+       case ERR_DISCONNECT:
+               ibmvscsis_free_command_q(vscsi);
+               vscsi->flags &= ~DISCONNECT_SCHEDULED;
+               vscsi->flags |= RESPONSE_Q_DOWN;
+               vscsi->state = ERR_DISCONNECTED;
+               pr_debug("adapter_idle, disc: flags 0x%x, state 0x%hx\n",
+                        vscsi->flags, vscsi->state);
+               break;
+
+       case WAIT_IDLE:
+               vscsi->rsp_q_timer.timer_pops = 0;
+               vscsi->debit = 0;
+               vscsi->credit = 0;
+               if (vscsi->flags & TRANS_EVENT) {
+                       vscsi->state = WAIT_CONNECTION;
+                       vscsi->flags &= PRESERVE_FLAG_FIELDS;
+               } else {
+                       vscsi->state = CONNECTED;
+                       vscsi->flags &= ~DISCONNECT_SCHEDULED;
+               }
+
+               pr_debug("adapter_idle, wait: flags 0x%x, state 0x%hx\n",
+                        vscsi->flags, vscsi->state);
+               ibmvscsis_poll_cmd_q(vscsi);
+               break;
+
+       case ERR_DISCONNECTED:
+               vscsi->flags &= ~DISCONNECT_SCHEDULED;
+               pr_debug("adapter_idle, disconnected: flags 0x%x, state 0x%hx\n",
+                        vscsi->flags, vscsi->state);
+               break;
+
+       default:
+               dev_err(&vscsi->dev, "adapter_idle: in invalid state %d\n",
+                       vscsi->state);
+               break;
+       }
+
+       if (free_qs)
+               ibmvscsis_free_cmd_qs(vscsi);
+
+       /*
+        * There is a timing window where we could lose a disconnect request.
+        * The known path to this window occurs during the DISCONNECT_RECONNECT
+        * case above: reset_queue calls free_command_q, which will release the
+        * interrupt lock.  During that time, a new post_disconnect call can be
+        * made with a "more severe" state (DISCONNECT or UNCONFIGURING).
+        * Because the DISCONNECT_SCHEDULED flag is already set, post_disconnect
+        * will only set the new_state.  Now free_command_q reacquires the intr
+        * lock and clears the DISCONNECT_SCHEDULED flag (using PRESERVE_FLAG_
+        * FIELDS), and the disconnect is lost.  This is particularly bad when
+        * the new disconnect was for UNCONFIGURING, since the unconfigure hangs
+        * forever.
+        * The fix is that free command queue records the acr state and acr
+        * flags if there is a change while the lock was released.
+        * Note that free command queue both writes and clears this state
+        * before releasing the lock, and different paths call it at different
+        * times, so the fields must not be initialized at the top of this
+        * function.
+        */
+       if (vscsi->phyp_acr_state != 0) {
+               /*
+                * set any bits in flags that may have been cleared by
+                * a call to free command queue in switch statement
+                * or reset queue
+                */
+               vscsi->flags |= vscsi->phyp_acr_flags;
+               ibmvscsis_post_disconnect(vscsi, vscsi->phyp_acr_state, 0);
+
+               /* trace the saved request before it is cleared below */
+               pr_debug("adapter_idle: flags 0x%x, state 0x%hx, acr_flags 0x%x, acr_state 0x%hx\n",
+                        vscsi->flags, vscsi->state, vscsi->phyp_acr_flags,
+                        vscsi->phyp_acr_state);
+
+               vscsi->phyp_acr_state = 0;
+               vscsi->phyp_acr_flags = 0;
+       }
+
+       pr_debug("Leaving adapter_idle: flags 0x%x, state 0x%hx, new_state 0x%x\n",
+                vscsi->flags, vscsi->state, vscsi->new_state);
+}
+
+/**
+ * ibmvscsis_copy_crq_packet() - Copy CRQ Packet
+ * @vscsi:     Pointer to our adapter structure
+ * @cmd:       Pointer to command element to use to process the request
+ * @crq:       Pointer to CRQ entry containing the request
+ *
+ * Copy the srp information unit from the hosted
+ * partition using remote dma
+ *
+ * The IU length taken from @crq must be non-zero and no larger than
+ * SRP_MAX_IU_LEN.  Every failure path posts an ERR_DISCONNECT_RECONNECT
+ * disconnect before returning.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt, interrupt lock held
+ *
+ * Return: SRP_VIOLATION for an invalid length, otherwise the return code
+ * of h_copy_rdma() (H_SUCCESS when the copy worked).
+ */
+static long ibmvscsis_copy_crq_packet(struct scsi_info *vscsi,
+                                     struct ibmvscsis_cmd *cmd,
+                                     struct viosrp_crq *crq)
+{
+       struct iu_entry *iue = cmd->iue;
+       long rc = 0;
+       u16 len;
+
+       len = be16_to_cpu(crq->IU_length);
+       if ((len > SRP_MAX_IU_LEN) || (len == 0)) {
+               dev_err(&vscsi->dev, "copy_crq: Invalid len %d passed\n", len);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               return SRP_VIOLATION;
+       }
+
+       /* pull the IU from the client (REMOTE) window into our buffer */
+       rc = h_copy_rdma(len, vscsi->dds.window[REMOTE].liobn,
+                        be64_to_cpu(crq->IU_data_ptr),
+                        vscsi->dds.window[LOCAL].liobn, iue->sbuf->dma);
+
+       switch (rc) {
+       case H_SUCCESS:
+               cmd->init_time = mftb();
+               /* remote_token keeps the pointer as received (not byte-swapped) */
+               iue->remote_token = crq->IU_data_ptr;
+               iue->iu_len = len;
+               pr_debug("copy_crq: ioba 0x%llx, init_time 0x%llx\n",
+                        be64_to_cpu(crq->IU_data_ptr), cmd->init_time);
+               break;
+       case H_PERMISSION:
+               /* add the client-failed flags only if the link is really down */
+               if (connection_broken(vscsi))
+                       ibmvscsis_post_disconnect(vscsi,
+                                                 ERR_DISCONNECT_RECONNECT,
+                                                 (RESPONSE_Q_DOWN |
+                                                  CLIENT_FAILED));
+               else
+                       ibmvscsis_post_disconnect(vscsi,
+                                                 ERR_DISCONNECT_RECONNECT, 0);
+
+               dev_err(&vscsi->dev, "copy_crq: h_copy_rdma failed, rc %ld\n",
+                       rc);
+               break;
+       case H_DEST_PARM:
+       case H_SOURCE_PARM:
+       default:
+               dev_err(&vscsi->dev, "copy_crq: h_copy_rdma failed, rc %ld\n",
+                       rc);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               break;
+       }
+
+       return rc;
+}
+
+/**
+ * ibmvscsis_adapter_info - Service an Adapter Info MAnagement Datagram
+ * @vscsi:     Pointer to our adapter structure
+ * @iue:       Information Unit containing the Adapter Info MAD request
+ *
+ * Reads the client's adapter info through a temporary DMA buffer, records
+ * it in vscsi->client_data, then overwrites the buffer with this server's
+ * info and copies it back to the client.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt, adapter lock is held
+ *
+ * Return: 0 when the request is rejected locally (the MAD status is set to
+ * VIOSRP_MAD_FAILED), otherwise the return code of the last h_copy_rdma()
+ * call.  A failed copy also posts an ERR_DISCONNECT_RECONNECT disconnect.
+ */
+static long ibmvscsis_adapter_info(struct scsi_info *vscsi,
+                                  struct iu_entry *iue)
+{
+       struct viosrp_adapter_info *mad = &vio_iu(iue)->mad.adapter_info;
+       struct mad_adapter_info_data *info;
+       uint flag_bits = 0;
+       dma_addr_t token;
+       long rc;
+
+       mad->common.status = cpu_to_be16(VIOSRP_MAD_SUCCESS);
+
+       /* the client may not ask for more than our info structure holds */
+       if (be16_to_cpu(mad->common.length) > sizeof(*info)) {
+               mad->common.status = cpu_to_be16(VIOSRP_MAD_FAILED);
+               return 0;
+       }
+
+       /*
+        * The adapter lock is held and we may be in interrupt context,
+        * so the allocation must not sleep.
+        */
+       info = dma_alloc_coherent(&vscsi->dma_dev->dev, sizeof(*info), &token,
+                                 GFP_ATOMIC);
+       if (!info) {
+               dev_err(&vscsi->dev, "bad dma_alloc_coherent %p\n",
+                       iue->target);
+               mad->common.status = cpu_to_be16(VIOSRP_MAD_FAILED);
+               return 0;
+       }
+
+       /* Get remote info */
+       rc = h_copy_rdma(be16_to_cpu(mad->common.length),
+                        vscsi->dds.window[REMOTE].liobn,
+                        be64_to_cpu(mad->buffer),
+                        vscsi->dds.window[LOCAL].liobn, token);
+
+       if (rc != H_SUCCESS) {
+               if (rc == H_PERMISSION) {
+                       if (connection_broken(vscsi))
+                               flag_bits = (RESPONSE_Q_DOWN | CLIENT_FAILED);
+               }
+               pr_warn("adapter_info: h_copy_rdma from client failed, rc %ld\n",
+                       rc);
+               pr_debug("adapter_info: ioba 0x%llx, flags 0x%x, flag_bits 0x%x\n",
+                        be64_to_cpu(mad->buffer), vscsi->flags, flag_bits);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT,
+                                         flag_bits);
+               goto free_dma;
+       }
+
+       /*
+        * Copy client info, but ignore partition number, which we
+        * already got from phyp - unless we failed to get it from
+        * phyp (e.g. if we're running on a p5 system).
+        */
+       if (vscsi->client_data.partition_number == 0)
+               vscsi->client_data.partition_number =
+                       be32_to_cpu(info->partition_number);
+       strncpy(vscsi->client_data.srp_version, info->srp_version,
+               sizeof(vscsi->client_data.srp_version));
+       strncpy(vscsi->client_data.partition_name, info->partition_name,
+               sizeof(vscsi->client_data.partition_name));
+       vscsi->client_data.mad_version = be32_to_cpu(info->mad_version);
+       vscsi->client_data.os_type = be32_to_cpu(info->os_type);
+
+       /* Copy our info */
+       strncpy(info->srp_version, SRP_VERSION,
+               sizeof(info->srp_version));
+       strncpy(info->partition_name, vscsi->dds.partition_name,
+               sizeof(info->partition_name));
+       info->partition_number = cpu_to_be32(vscsi->dds.partition_num);
+       info->mad_version = cpu_to_be32(MAD_VERSION_1);
+       info->os_type = cpu_to_be32(LINUX);
+       memset(&info->port_max_txu[0], 0, sizeof(info->port_max_txu));
+       info->port_max_txu[0] = cpu_to_be32(128 * PAGE_SIZE);
+
+       /* make the buffer contents visible before the copy to the client */
+       dma_wmb();
+       rc = h_copy_rdma(sizeof(*info), vscsi->dds.window[LOCAL].liobn,
+                        token, vscsi->dds.window[REMOTE].liobn,
+                        be64_to_cpu(mad->buffer));
+       switch (rc) {
+       case H_SUCCESS:
+               break;
+
+       case H_SOURCE_PARM:
+       case H_DEST_PARM:
+       case H_PERMISSION:
+               if (connection_broken(vscsi))
+                       flag_bits = (RESPONSE_Q_DOWN | CLIENT_FAILED);
+               /* fall through - every failure logs and posts a disconnect */
+       default:
+               dev_err(&vscsi->dev, "adapter_info: h_copy_rdma to client failed, rc %ld\n",
+                       rc);
+               ibmvscsis_post_disconnect(vscsi,
+                                         ERR_DISCONNECT_RECONNECT,
+                                         flag_bits);
+               break;
+       }
+
+free_dma:
+       dma_free_coherent(&vscsi->dma_dev->dev, sizeof(*info), info, token);
+       pr_debug("Leaving adapter_info, rc %ld\n", rc);
+
+       return rc;
+}
+
+/**
+ * ibmvscsis_cap_mad() - Service a Capabilities MAnagement Datagram
+ * @vscsi:     Pointer to our adapter structure
+ * @iue:       Information Unit containing the Capabilities MAD request
+ *
+ * Copies the client's capabilities buffer in, stamps our device name
+ * into it, marks every listed capability as unsupported, and copies the
+ * buffer back to the client.
+ *
+ * NOTE: if you return an error from this routine you must be
+ * disconnecting or you will cause a hang
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt called with adapter lock held
+ *
+ * Return: 0 when the request is rejected locally (the MAD status is set to
+ * VIOSRP_MAD_FAILED), otherwise the return code of the last h_copy_rdma()
+ * call.
+ */
+static int ibmvscsis_cap_mad(struct scsi_info *vscsi, struct iu_entry *iue)
+{
+       struct viosrp_capabilities *mad = &vio_iu(iue)->mad.capabilities;
+       struct capabilities *cap;
+       struct mad_capability_common *common;
+       dma_addr_t token;
+       u16 olen, len, status, min_len, cap_len;
+       u32 flag;
+       uint flag_bits = 0;
+       long rc = 0;
+
+       olen = be16_to_cpu(mad->common.length);
+       /*
+        * struct capabilities hardcodes a couple capabilities after the
+        * header, but the capabilities can actually be in any order.
+        */
+       min_len = offsetof(struct capabilities, migration);
+       if ((olen < min_len) || (olen > PAGE_SIZE)) {
+               pr_warn("cap_mad: invalid len %d\n", olen);
+               mad->common.status = cpu_to_be16(VIOSRP_MAD_FAILED);
+               return 0;
+       }
+
+       /*
+        * The adapter lock is held and we may be in interrupt context,
+        * so the allocation must not sleep.
+        */
+       cap = dma_alloc_coherent(&vscsi->dma_dev->dev, olen, &token,
+                                GFP_ATOMIC);
+       if (!cap) {
+               dev_err(&vscsi->dev, "bad dma_alloc_coherent %p\n",
+                       iue->target);
+               mad->common.status = cpu_to_be16(VIOSRP_MAD_FAILED);
+               return 0;
+       }
+       rc = h_copy_rdma(olen, vscsi->dds.window[REMOTE].liobn,
+                        be64_to_cpu(mad->buffer),
+                        vscsi->dds.window[LOCAL].liobn, token);
+       if (rc == H_SUCCESS) {
+               strncpy(cap->name, dev_name(&vscsi->dma_dev->dev),
+                       SRP_MAX_LOC_LEN);
+
+               /* bytes of capability entries that follow the fixed header */
+               len = olen - min_len;
+               status = VIOSRP_MAD_SUCCESS;
+               common = (struct mad_capability_common *)&cap->migration;
+
+               while ((len > 0) && (status == VIOSRP_MAD_SUCCESS) && !rc) {
+                       pr_debug("cap_mad: len left %hd, cap type %d, cap len %hd\n",
+                                len, be32_to_cpu(common->cap_type),
+                                be16_to_cpu(common->length));
+
+                       cap_len = be16_to_cpu(common->length);
+                       if (cap_len > len) {
+                               dev_err(&vscsi->dev, "cap_mad: cap len mismatch with total len\n");
+                               status = VIOSRP_MAD_FAILED;
+                               break;
+                       }
+
+                       /* a zero length would never advance the walk */
+                       if (cap_len == 0) {
+                               dev_err(&vscsi->dev, "cap_mad: cap len is 0\n");
+                               status = VIOSRP_MAD_FAILED;
+                               break;
+                       }
+
+                       switch (common->cap_type) {
+                       default:
+                               pr_debug("cap_mad: unsupported capability\n");
+                               common->server_support = 0;
+                               flag = cpu_to_be32((u32)CAP_LIST_SUPPORTED);
+                               cap->flags &= ~flag;
+                               break;
+                       }
+
+                       /* step to the next capability entry */
+                       len = len - cap_len;
+                       common = (struct mad_capability_common *)
+                               ((char *)common + cap_len);
+               }
+
+               mad->common.status = cpu_to_be16(status);
+
+               /* make the buffer contents visible before the copy back */
+               dma_wmb();
+               rc = h_copy_rdma(olen, vscsi->dds.window[LOCAL].liobn, token,
+                                vscsi->dds.window[REMOTE].liobn,
+                                be64_to_cpu(mad->buffer));
+
+               if (rc != H_SUCCESS) {
+                       pr_debug("cap_mad: failed to copy to client, rc %ld\n",
+                                rc);
+
+                       if (rc == H_PERMISSION) {
+                               if (connection_broken(vscsi))
+                                       flag_bits = (RESPONSE_Q_DOWN |
+                                                    CLIENT_FAILED);
+                       }
+
+                       pr_warn("cap_mad: error copying data to client, rc %ld\n",
+                               rc);
+                       ibmvscsis_post_disconnect(vscsi,
+                                                 ERR_DISCONNECT_RECONNECT,
+                                                 flag_bits);
+               }
+       }
+
+       dma_free_coherent(&vscsi->dma_dev->dev, olen, cap, token);
+
+       pr_debug("Leaving cap_mad, rc %ld, client_cap 0x%x\n",
+                rc, vscsi->client_cap);
+
+       return rc;
+}
+
+/**
+ * ibmvscsis_process_mad() - Service a MAnagement Datagram
+ * @vscsi:     Pointer to our adapter structure
+ * @iue:       Information Unit containing the MAD request
+ *
+ * Dispatches on the MAD type.  Adapter-info and capabilities requests are
+ * delegated to their own handlers and their result is returned as is; for
+ * all other types the MAD status is filled in here.
+ *
+ * Must be called with interrupt lock held.
+ *
+ * Return: the delegated handler's return code, or ADAPT_SUCCESS.
+ */
+static long ibmvscsis_process_mad(struct scsi_info *vscsi, struct iu_entry *iue)
+{
+       struct mad_common *hdr = (struct mad_common *)&vio_iu(iue)->mad;
+       struct viosrp_empty_iu *empty_iu;
+       u16 status;
+
+       switch (be32_to_cpu(hdr->type)) {
+       case VIOSRP_ADAPTER_INFO_TYPE:
+               /* status is not touched here for this type */
+               return ibmvscsis_adapter_info(vscsi, iue);
+
+       case VIOSRP_CAPABILITIES_TYPE:
+               /* status is not touched here for this type */
+               return ibmvscsis_cap_mad(vscsi, iue);
+
+       case VIOSRP_EMPTY_IU_TYPE:
+               /* remember the client's empty IU buffer and tag */
+               empty_iu = &vio_iu(iue)->mad.empty_iu;
+               vscsi->empty_iu_id = be64_to_cpu(empty_iu->buffer);
+               vscsi->empty_iu_tag = be64_to_cpu(empty_iu->common.tag);
+               status = VIOSRP_MAD_SUCCESS;
+               break;
+
+       case VIOSRP_ENABLE_FAST_FAIL:
+               /* only accepted while the adapter is in the CONNECTED state */
+               if (vscsi->state != CONNECTED) {
+                       pr_warn("fast fail mad sent after login\n");
+                       status = VIOSRP_MAD_FAILED;
+               } else {
+                       vscsi->fast_fail = true;
+                       status = VIOSRP_MAD_SUCCESS;
+               }
+               break;
+
+       default:
+               status = VIOSRP_MAD_NOT_SUPPORTED;
+               break;
+       }
+
+       hdr->status = cpu_to_be16(status);
+
+       return ADAPT_SUCCESS;
+}
+
+/**
+ * srp_snd_msg_failed() - Handle an error when sending a response
+ * @vscsi:     Pointer to our adapter structure
+ * @rc:                The return code from the h_send_crq command
+ *
+ * Any code other than H_DROPPED is treated as a hard failure: waiting
+ * responses are freed and, the first time round, a disconnect is posted.
+ * H_DROPPED means the client's queue is full; the response queue timer is
+ * (re)armed unless the driver has waited too long outside of
+ * SRP_PROCESSING, in which case the connection is torn down.
+ *
+ * Must be called with interrupt lock held.
+ */
+static void srp_snd_msg_failed(struct scsi_info *vscsi, long rc)
+{
+       /* states in which an error has already been logged */
+       const uint quiet_states = ERR_DISCONNECT | ERR_DISCONNECT_RECONNECT |
+                                 ERR_DISCONNECTED | UNDEFINED;
+       ktime_t delay;
+
+       if (rc != H_DROPPED) {
+               ibmvscsis_free_cmd_qs(vscsi);
+
+               if (rc == H_CLOSED)
+                       vscsi->flags |= CLIENT_FAILED;
+
+               /* don't flag the same problem multiple times */
+               if (vscsi->flags & RESPONSE_Q_DOWN)
+                       return;
+
+               vscsi->flags |= RESPONSE_Q_DOWN;
+               if (!(vscsi->state & quiet_states))
+                       dev_err(&vscsi->dev, "snd_msg_failed: setting RESPONSE_Q_DOWN, state 0x%hx, flags 0x%x, rc %ld\n",
+                               vscsi->state, vscsi->flags, rc);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               return;
+       }
+
+       /*
+        * The response queue is full.
+        * If the server is processing SRP requests, i.e. the client has
+        * successfully done an SRP_LOGIN, then it will wait forever for
+        * room in the queue.  However if the system admin is attempting to
+        * unconfigure the server then one or more children will be in a
+        * state where they are being removed.  So if there is even one
+        * child being removed then the driver assumes the system admin is
+        * attempting to break the connection with the client and
+        * MAX_TIMER_POPS is honored.
+        */
+       if ((vscsi->rsp_q_timer.timer_pops >= MAX_TIMER_POPS) &&
+           (vscsi->state != SRP_PROCESSING)) {
+               /*
+                * TBD: Do we need to worry about this? Need to get
+                *      remove working.
+                */
+               /*
+                * waited a long time and it appears the system admin
+                * is bringing this driver down
+                */
+               vscsi->flags |= RESPONSE_Q_DOWN;
+               ibmvscsis_free_cmd_qs(vscsi);
+
+               /*
+                * if the driver is already attempting to disconnect
+                * from the client and has already logged an error
+                * trace this event but don't put it in the error log
+                */
+               if (!(vscsi->state & quiet_states)) {
+                       dev_err(&vscsi->dev, "client crq full too long\n");
+                       ibmvscsis_post_disconnect(vscsi,
+                                                 ERR_DISCONNECT_RECONNECT,
+                                                 0);
+               }
+               return;
+       }
+
+       pr_debug("snd_msg_failed: response queue full, flags 0x%x, timer started %d, pops %d\n",
+                vscsi->flags, (int)vscsi->rsp_q_timer.started,
+                vscsi->rsp_q_timer.timer_pops);
+
+       /* nothing more to do if the timer is already running */
+       if (vscsi->rsp_q_timer.started)
+               return;
+
+       /*
+        * Short wait while under the pop limit; slide to the long
+        * timeslice once the maximum timer pops have already happened.
+        */
+       if (vscsi->rsp_q_timer.timer_pops < MAX_TIMER_POPS)
+               delay = ktime_set(0, WAIT_NANO_SECONDS);
+       else
+               delay = ktime_set(WAIT_SECONDS, 0);
+
+       vscsi->rsp_q_timer.started = true;
+       hrtimer_start(&vscsi->rsp_q_timer.timer, delay, HRTIMER_MODE_REL);
+}
+
+/**
+ * ibmvscsis_send_messages() - Send a Response
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * Send a response, first checking the waiting queue. Responses are
+ * sent in order they are received. If the response cannot be sent,
+ * because the client queue is full, it stays on the waiting queue.
+ *
+ * If RESPONSE_Q_DOWN is set nothing is sent and the waiting queue is
+ * simply freed.
+ *
+ * PRECONDITION:
+ *     Called with interrupt lock held
+ */
+static void ibmvscsis_send_messages(struct scsi_info *vscsi)
+{
+       u64 msg_hi = 0;
+       /*
+        * crq is only an overlay on the 8 bytes of msg_hi, used to build the
+        * first word of the CRQ message.  Do not attempt to access
+        * IU_data_ptr through this pointer; it is not valid.
+        */
+       struct viosrp_crq *crq = (struct viosrp_crq *)&msg_hi;
+       struct ibmvscsis_cmd *cmd, *nxt;
+       struct iu_entry *iue;
+       long rc = ADAPT_SUCCESS;
+
+       if (!(vscsi->flags & RESPONSE_Q_DOWN)) {
+               list_for_each_entry_safe(cmd, nxt, &vscsi->waiting_rsp, list) {
+                       pr_debug("send_messages cmd %p\n", cmd);
+
+                       iue = cmd->iue;
+
+                       crq->valid = VALID_CMD_RESP_EL;
+                       crq->format = cmd->rsp.format;
+
+                       /*
+                        * NOTE(review): msg_hi is not reset between
+                        * iterations, so once set the status appears to
+                        * carry over to later responses - confirm intended.
+                        */
+                       if (cmd->flags & CMD_FAST_FAIL)
+                               crq->status = VIOSRP_ADAPTER_FAIL;
+
+                       crq->IU_length = cpu_to_be16(cmd->rsp.len);
+
+                       /* second word of the message is the response tag */
+                       rc = h_send_crq(vscsi->dma_dev->unit_address,
+                                       be64_to_cpu(msg_hi),
+                                       be64_to_cpu(cmd->rsp.tag));
+
+                       pr_debug("send_messages: tag 0x%llx, rc %ld\n",
+                                be64_to_cpu(cmd->rsp.tag), rc);
+
+                       /* if all ok free up the command element resources */
+                       if (rc == H_SUCCESS) {
+                               /* some movement has occurred */
+                               vscsi->rsp_q_timer.timer_pops = 0;
+                               list_del(&cmd->list);
+
+                               ibmvscsis_free_cmd_resources(vscsi, cmd);
+                       } else {
+                               /* stop at the first failure to keep ordering */
+                               srp_snd_msg_failed(vscsi, rc);
+                               break;
+                       }
+               }
+
+               if (!rc) {
+                       /*
+                        * The timer could pop with the queue empty.  If
+                        * this happens, rc will always indicate a
+                        * success; clear the pop count.
+                        */
+                       vscsi->rsp_q_timer.timer_pops = 0;
+               }
+       } else {
+               ibmvscsis_free_cmd_qs(vscsi);
+       }
+}
+
+/*
+ * Copy the MAD header held in the command's IU buffer back to the client
+ * and, if that works, queue the command on waiting_rsp and try to send
+ * the response.  On a failed copy the command is released and a
+ * disconnect is posted.
+ *
+ * Called with intr lock held
+ */
+static void ibmvscsis_send_mad_resp(struct scsi_info *vscsi,
+                                   struct ibmvscsis_cmd *cmd,
+                                   struct viosrp_crq *crq)
+{
+       struct iu_entry *iue = cmd->iue;
+       struct mad_common *hdr = (struct mad_common *)&vio_iu(iue)->mad;
+       uint disc_flags = 0;
+       long rc;
+
+       dma_wmb();
+       rc = h_copy_rdma(sizeof(struct mad_common),
+                        vscsi->dds.window[LOCAL].liobn, iue->sbuf->dma,
+                        vscsi->dds.window[REMOTE].liobn,
+                        be64_to_cpu(crq->IU_data_ptr));
+       if (rc) {
+               pr_debug("Error sending mad response, rc %ld\n", rc);
+
+               /* add the client-failed flags only if the link is really down */
+               if (rc == H_PERMISSION && connection_broken(vscsi))
+                       disc_flags = (RESPONSE_Q_DOWN | CLIENT_FAILED);
+
+               dev_err(&vscsi->dev, "mad: failed to copy to client, rc %ld\n",
+                       rc);
+
+               ibmvscsis_free_cmd_resources(vscsi, cmd);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT,
+                                         disc_flags);
+               return;
+       }
+
+       /* copy succeeded: describe the response and queue it for sending */
+       cmd->rsp.format = VIOSRP_MAD_FORMAT;
+       cmd->rsp.len = sizeof(struct mad_common);
+       cmd->rsp.tag = hdr->tag;
+       list_add_tail(&cmd->list, &vscsi->waiting_rsp);
+       ibmvscsis_send_messages(vscsi);
+}
+
+/**
+ * ibmvscsis_mad() - Service a MAnagement Data gram.
+ * @vscsi:     Pointer to our adapter structure
+ * @crq:       Pointer to the CRQ entry containing the MAD request
+ *
+ * Takes a free command element, copies the MAD in from the client with
+ * ibmvscsis_copy_crq_packet(), hands it to ibmvscsis_process_mad() and, when
+ * that succeeds, copies the response back with ibmvscsis_send_mad_resp().
+ *
+ * Return: ADAPT_SUCCESS when the MAD is ignored because of the adapter
+ * state, ERROR when no command element is available, otherwise the return
+ * code of the copy-in or of ibmvscsis_process_mad().
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt  called with adapter lock held
+ */
+static long ibmvscsis_mad(struct scsi_info *vscsi, struct viosrp_crq *crq)
+{
+       struct iu_entry *iue;
+       struct ibmvscsis_cmd *cmd;
+       struct mad_common *mad;
+       long rc = ADAPT_SUCCESS;
+
+       switch (vscsi->state) {
+               /*
+                * We have not exchanged Init Msgs yet, so this MAD was sent
+                * before the last Transport Event; client will not be
+                * expecting a response.
+                */
+       case WAIT_CONNECTION:
+               pr_debug("mad: in Wait Connection state, ignoring MAD, flags %d\n",
+                        vscsi->flags);
+               return ADAPT_SUCCESS;
+
+       case SRP_PROCESSING:
+       case CONNECTED:
+               break;
+
+               /*
+                * We should never get here while we're in these states.
+                * Just log an error and get out.
+                */
+       case UNCONFIGURING:
+       case WAIT_IDLE:
+       case ERR_DISCONNECT:
+       case ERR_DISCONNECT_RECONNECT:
+       default:
+               dev_err(&vscsi->dev, "mad: invalid adapter state %d for mad\n",
+                       vscsi->state);
+               return ADAPT_SUCCESS;
+       }
+
+       cmd = ibmvscsis_get_free_cmd(vscsi);
+       if (!cmd) {
+               dev_err(&vscsi->dev, "mad: failed to get cmd, debit %d\n",
+                       vscsi->debit);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               return ERROR;
+       }
+       iue = cmd->iue;
+       cmd->type = ADAPTER_MAD;
+
+       rc = ibmvscsis_copy_crq_packet(vscsi, cmd, crq);
+       if (!rc) {
+               mad = (struct mad_common *)&vio_iu(iue)->mad;
+
+               pr_debug("mad: type %d\n", be32_to_cpu(mad->type));
+
+               /*
+                * No "length < 0" test here: be16_to_cpu() yields an
+                * unsigned 16-bit value, so such a test can never be true.
+                */
+               rc = ibmvscsis_process_mad(vscsi, iue);
+
+               pr_debug("mad: status %hd, rc %ld\n", be16_to_cpu(mad->status),
+                        rc);
+
+               if (!rc)
+                       ibmvscsis_send_mad_resp(vscsi, cmd, crq);
+       } else {
+               /* Copy-in failed; give the command element back */
+               ibmvscsis_free_cmd_resources(vscsi, cmd);
+       }
+
+       pr_debug("Leaving mad, rc %ld\n", rc);
+       return rc;
+}
+
+/**
+ * ibmvscsis_login_rsp() - Create/copy a login response notice to the client
+ * @vscsi:     Pointer to our adapter structure
+ * @cmd:       Pointer to the command for the SRP Login request
+ *
+ * Builds an SRP_LOGIN_RSP in the command's IU buffer and copies it to the
+ * client with h_copy_rdma().  Any copy failure posts a disconnect/reconnect.
+ *
+ * Return: the h_copy_rdma() return code.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt, interrupt lock held
+ */
+static long ibmvscsis_login_rsp(struct scsi_info *vscsi,
+                               struct ibmvscsis_cmd *cmd)
+{
+       struct iu_entry *iue = cmd->iue;
+       struct srp_login_rsp *rsp = &vio_iu(iue)->srp.login_rsp;
+       struct format_code *fmt = (struct format_code *)&rsp->buf_fmt;
+       uint disc_flags = 0;
+       long copy_rc;
+
+       /* Start from a clean response and fill in the login parameters */
+       memset(rsp, 0, sizeof(*rsp));
+       rsp->opcode = SRP_LOGIN_RSP;
+       rsp->req_lim_delta = cpu_to_be32(vscsi->request_limit);
+       rsp->tag = cmd->rsp.tag;
+       rsp->max_it_iu_len = cpu_to_be32(SRP_MAX_IU_LEN);
+       rsp->max_ti_iu_len = cpu_to_be32(SRP_MAX_IU_LEN);
+       fmt->buffers = SUPPORTED_FORMATS;
+       vscsi->credit = 0;
+
+       cmd->rsp.len = sizeof(*rsp);
+
+       dma_wmb();
+       copy_rc = h_copy_rdma(cmd->rsp.len, vscsi->dds.window[LOCAL].liobn,
+                             iue->sbuf->dma, vscsi->dds.window[REMOTE].liobn,
+                             be64_to_cpu(iue->remote_token));
+       if (copy_rc == H_SUCCESS)
+               return copy_rc;
+
+       /*
+        * Every failure (H_PERMISSION, H_SOURCE_PARM, H_DEST_PARM or anything
+        * else) is logged and triggers a disconnect/reconnect; only
+        * H_PERMISSION additionally checks whether the client is gone.
+        */
+       if (copy_rc == H_PERMISSION && connection_broken(vscsi))
+               disc_flags = RESPONSE_Q_DOWN | CLIENT_FAILED;
+
+       dev_err(&vscsi->dev, "login_rsp: error copying to client, rc %ld\n",
+               copy_rc);
+       ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT,
+                                 disc_flags);
+
+       return copy_rc;
+}
+
+/**
+ * ibmvscsis_srp_login_rej() - Create/copy a login rejection notice to client
+ * @vscsi:     Pointer to our adapter structure
+ * @cmd:       Pointer to the command for the SRP Login request
+ * @reason:    The reason the SRP Login is being rejected, per SRP protocol
+ *
+ * Builds an SRP_LOGIN_REJ in the command's IU buffer and copies it to the
+ * client with h_copy_rdma().  Any copy failure posts a disconnect/reconnect.
+ *
+ * Return: the h_copy_rdma() return code.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt, interrupt lock held
+ */
+static long ibmvscsis_srp_login_rej(struct scsi_info *vscsi,
+                                   struct ibmvscsis_cmd *cmd, u32 reason)
+{
+       struct iu_entry *iue = cmd->iue;
+       struct srp_login_rej *rej = &vio_iu(iue)->srp.login_rej;
+       struct format_code *fmt = (struct format_code *)&rej->buf_fmt;
+       uint disc_flags = 0;
+       long copy_rc;
+
+       /* Start from a clean rejection and fill in the reason */
+       memset(rej, 0, sizeof(*rej));
+       rej->opcode = SRP_LOGIN_REJ;
+       rej->reason = cpu_to_be32(reason);
+       rej->tag = cmd->rsp.tag;
+       fmt->buffers = SUPPORTED_FORMATS;
+
+       cmd->rsp.len = sizeof(*rej);
+
+       dma_wmb();
+       copy_rc = h_copy_rdma(cmd->rsp.len, vscsi->dds.window[LOCAL].liobn,
+                             iue->sbuf->dma, vscsi->dds.window[REMOTE].liobn,
+                             be64_to_cpu(iue->remote_token));
+       if (copy_rc == H_SUCCESS)
+               return copy_rc;
+
+       /*
+        * Every failure (H_PERMISSION, H_SOURCE_PARM, H_DEST_PARM or anything
+        * else) is logged and triggers a disconnect/reconnect; only
+        * H_PERMISSION additionally checks whether the client is gone.
+        */
+       if (copy_rc == H_PERMISSION && connection_broken(vscsi))
+               disc_flags =  RESPONSE_Q_DOWN | CLIENT_FAILED;
+
+       dev_err(&vscsi->dev, "login_rej: error copying to client, rc %ld\n",
+               copy_rc);
+       ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT,
+                                 disc_flags);
+
+       return copy_rc;
+}
+
+/*
+ * Allocate the nexus for @tport and its target session, unless the port
+ * already has one.
+ *
+ * Returns 0 on success (or if a nexus already exists), -ENOMEM if the nexus
+ * cannot be allocated, or the error encoded in the pointer returned by
+ * target_alloc_session().
+ *
+ * NOTE(review): kzalloc(GFP_KERNEL) may sleep, and ibmvscsis_srp_login(),
+ * which calls this function, documents itself as running with the interrupt
+ * lock held - confirm that this call path is safe.
+ */
+static int ibmvscsis_make_nexus(struct ibmvscsis_tport *tport)
+{
+       struct ibmvscsis_nexus *nexus;
+       int err;
+
+       if (tport->ibmv_nexus) {
+               pr_debug("tport->ibmv_nexus already exists\n");
+               return 0;
+       }
+
+       nexus = kzalloc(sizeof(*nexus), GFP_KERNEL);
+       if (!nexus) {
+               pr_err("Unable to allocate struct ibmvscsis_nexus\n");
+               return -ENOMEM;
+       }
+
+       nexus->se_sess = target_alloc_session(&tport->se_tpg, 0, 0,
+                                             TARGET_PROT_NORMAL,
+                                             tport->tport_name, nexus, NULL);
+       if (IS_ERR(nexus->se_sess)) {
+               /* Session setup failed: undo the allocation and report it */
+               err = PTR_ERR(nexus->se_sess);
+               kfree(nexus);
+               return err;
+       }
+
+       tport->ibmv_nexus = nexus;
+
+       return 0;
+}
+
+/*
+ * Tear down the nexus attached to @tport: deregister its target session,
+ * detach the nexus from the port and free it.
+ *
+ * Returns 0 on success, or -ENODEV if the port has no nexus or the nexus
+ * has no session.
+ */
+static int ibmvscsis_drop_nexus(struct ibmvscsis_tport *tport)
+{
+       struct ibmvscsis_nexus *nexus = tport->ibmv_nexus;
+
+       if (!nexus || !nexus->se_sess)
+               return -ENODEV;
+
+       /*
+        * Release the SCSI I_T Nexus to the emulated ibmvscsis Target Port
+        */
+       transport_deregister_session(nexus->se_sess);
+       tport->ibmv_nexus = NULL;
+       kfree(nexus);
+
+       return 0;
+}
+
+/**
+ * ibmvscsis_srp_login() - Process an SRP Login Request
+ * @vscsi:     Pointer to our adapter structure
+ * @cmd:       Command element to use to process the SRP Login request
+ * @crq:       Pointer to CRQ entry containing the SRP Login request
+ *
+ * Validates the login request, makes sure a nexus exists for the target
+ * port, and answers the client with either a login response or a login
+ * reject.  When the answer was copied to the client successfully the
+ * command is queued on waiting_rsp; otherwise the command is released.
+ *
+ * Return: the return code of ibmvscsis_login_rsp() or
+ * ibmvscsis_srp_login_rej(), whichever was used to answer the client.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt, called with interrupt lock held
+ */
+static long ibmvscsis_srp_login(struct scsi_info *vscsi,
+                               struct ibmvscsis_cmd *cmd,
+                               struct viosrp_crq *crq)
+{
+       struct iu_entry *iue = cmd->iue;
+       struct srp_login_req *req = &vio_iu(iue)->srp.login_req;
+       struct port_id {
+               __be64 id_extension;
+               __be64 io_guid;
+       } *iport, *tport;
+       struct format_code *fmt;
+       u32 reason = 0x0;
+       long rc = ADAPT_SUCCESS;
+
+       iport = (struct port_id *)req->initiator_port_id;
+       tport = (struct port_id *)req->target_port_id;
+       fmt = (struct format_code *)&req->req_buf_fmt;
+       if (be32_to_cpu(req->req_it_iu_len) > SRP_MAX_IU_LEN)
+               reason = SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE;
+       else if (be32_to_cpu(req->req_it_iu_len) < 64)
+               reason = SRP_LOGIN_REJ_UNABLE_ESTABLISH_CHANNEL;
+       else if ((be64_to_cpu(iport->id_extension) > (MAX_NUM_PORTS - 1)) ||
+                (be64_to_cpu(tport->id_extension) > (MAX_NUM_PORTS - 1)))
+               reason = SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL;
+       else if (req->req_flags & SRP_MULTICHAN_MULTI)
+               reason = SRP_LOGIN_REJ_MULTI_CHANNEL_UNSUPPORTED;
+       else if (fmt->buffers & (~SUPPORTED_FORMATS))
+               /* Client asked for a format we do not support */
+               reason = SRP_LOGIN_REJ_UNSUPPORTED_DESCRIPTOR_FMT;
+       else if ((fmt->buffers & SUPPORTED_FORMATS) == 0)
+               /*
+                * Client offered none of the supported formats.  This must
+                * be a bitwise AND; an OR with the non-empty supported mask
+                * could never evaluate to zero.
+                */
+               reason = SRP_LOGIN_REJ_UNSUPPORTED_DESCRIPTOR_FMT;
+
+       /* A second login while already processing SRP is rejected */
+       if (vscsi->state == SRP_PROCESSING)
+               reason = SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED;
+
+       rc = ibmvscsis_make_nexus(&vscsi->tport);
+       if (rc)
+               reason = SRP_LOGIN_REJ_UNABLE_ESTABLISH_CHANNEL;
+
+       cmd->rsp.format = VIOSRP_SRP_FORMAT;
+       cmd->rsp.tag = req->tag;
+
+       pr_debug("srp_login: reason 0x%x\n", reason);
+
+       if (reason)
+               rc = ibmvscsis_srp_login_rej(vscsi, cmd, reason);
+       else
+               rc = ibmvscsis_login_rsp(vscsi, cmd);
+
+       if (!rc) {
+               /* Only an accepted login moves the adapter forward */
+               if (!reason)
+                       vscsi->state = SRP_PROCESSING;
+
+               list_add_tail(&cmd->list, &vscsi->waiting_rsp);
+               ibmvscsis_send_messages(vscsi);
+       } else {
+               ibmvscsis_free_cmd_resources(vscsi, cmd);
+       }
+
+       pr_debug("Leaving srp_login, rc %ld\n", rc);
+       return rc;
+}
+
+/**
+ * ibmvscsis_srp_i_logout() - Helper Function to close I_T Nexus
+ * @vscsi:     Pointer to our adapter structure
+ * @cmd:       Command element to use to process the Implicit Logout request
+ * @crq:       Pointer to CRQ entry containing the Implicit Logout request
+ *
+ * Do the logic to close the I_T nexus.  This function may not
+ * behave to specification.
+ *
+ * If any work is still outstanding the adapter is sent to ERR_DISCONNECT;
+ * otherwise a response is queued and the adapter is sent to WAIT_IDLE.
+ *
+ * Return: always ADAPT_SUCCESS.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt, interrupt lock held
+ */
+static long ibmvscsis_srp_i_logout(struct scsi_info *vscsi,
+                                  struct ibmvscsis_cmd *cmd,
+                                  struct viosrp_crq *crq)
+{
+       struct iu_entry *iue = cmd->iue;
+       struct srp_i_logout *log_out = &vio_iu(iue)->srp.i_logout;
+
+       /* A logout with work still in flight is treated as an error */
+       if ((vscsi->debit > 0) || !list_empty(&vscsi->schedule_q) ||
+           !list_empty(&vscsi->waiting_rsp)) {
+               dev_err(&vscsi->dev, "i_logout: outstanding work\n");
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT, 0);
+               return ADAPT_SUCCESS;
+       }
+
+       cmd->rsp.format = SRP_FORMAT;
+       cmd->rsp.tag = log_out->tag;
+       cmd->rsp.len = sizeof(struct mad_common);
+       list_add_tail(&cmd->list, &vscsi->waiting_rsp);
+       ibmvscsis_send_messages(vscsi);
+
+       ibmvscsis_post_disconnect(vscsi, WAIT_IDLE, 0);
+
+       return ADAPT_SUCCESS;
+}
+
+/*
+ * Handle a CRQ element in SRP format: copy the IU in from the client and
+ * dispatch on its opcode.  SCSI commands and task management requests are
+ * placed on schedule_q and handed to the adapter work queue; logins and
+ * implicit logouts are handled inline.  Anything unexpected releases the
+ * command and posts a disconnect/reconnect.
+ *
+ * Called with intr lock held
+ */
+static void ibmvscsis_srp_cmd(struct scsi_info *vscsi, struct viosrp_crq *crq)
+{
+       struct ibmvscsis_cmd *cmd;
+       struct iu_entry *iue;
+       struct srp_cmd *srp;
+       struct srp_tsk_mgmt *tsk;
+
+       if (vscsi->request_limit - vscsi->debit <= 0) {
+               /* Client has exceeded request limit */
+               dev_err(&vscsi->dev, "Client exceeded the request limit (%d), debit %d\n",
+                       vscsi->request_limit, vscsi->debit);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               return;
+       }
+
+       cmd = ibmvscsis_get_free_cmd(vscsi);
+       if (!cmd) {
+               dev_err(&vscsi->dev, "srp_cmd failed to get cmd, debit %d\n",
+                       vscsi->debit);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               return;
+       }
+       iue = cmd->iue;
+       srp = &vio_iu(iue)->srp.cmd;
+
+       if (ibmvscsis_copy_crq_packet(vscsi, cmd, crq)) {
+               ibmvscsis_free_cmd_resources(vscsi, cmd);
+               return;
+       }
+
+       if (vscsi->state != SRP_PROCESSING) {
+               /* Before login completes, only a login request is legal */
+               if (srp->opcode == SRP_LOGIN_REQ &&
+                   vscsi->state == CONNECTED) {
+                       ibmvscsis_srp_login(vscsi, cmd, crq);
+                       return;
+               }
+
+               ibmvscsis_free_cmd_resources(vscsi, cmd);
+               dev_err(&vscsi->dev, "Invalid state %d to handle srp cmd\n",
+                       vscsi->state);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               return;
+       }
+
+       switch (srp->opcode) {
+       case SRP_LOGIN_REQ:
+               ibmvscsis_srp_login(vscsi, cmd, crq);
+               break;
+
+       case SRP_TSK_MGMT:
+               tsk = &vio_iu(iue)->srp.tsk_mgmt;
+               pr_debug("tsk_mgmt tag: %llu (0x%llx)\n", tsk->tag,
+                        tsk->tag);
+               cmd->rsp.tag = tsk->tag;
+               vscsi->debit += 1;
+               cmd->type = TASK_MANAGEMENT;
+               list_add_tail(&cmd->list, &vscsi->schedule_q);
+               queue_work(vscsi->work_q, &cmd->work);
+               break;
+
+       case SRP_CMD:
+               pr_debug("srp_cmd tag: %llu (0x%llx)\n", srp->tag,
+                        srp->tag);
+               cmd->rsp.tag = srp->tag;
+               vscsi->debit += 1;
+               cmd->type = SCSI_CDB;
+               /*
+                * schedule_q records the work that is still waiting
+                * for the workqueue.
+                */
+               list_add_tail(&cmd->list, &vscsi->schedule_q);
+               queue_work(vscsi->work_q, &cmd->work);
+               break;
+
+       case SRP_I_LOGOUT:
+               ibmvscsis_srp_i_logout(vscsi, cmd, crq);
+               break;
+
+       case SRP_CRED_RSP:
+       case SRP_AER_RSP:
+       default:
+               ibmvscsis_free_cmd_resources(vscsi, cmd);
+               dev_err(&vscsi->dev, "invalid srp cmd, opcode %d\n",
+                       (uint)srp->opcode);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               break;
+       }
+}
+
+/**
+ * ibmvscsis_ping_response() - Respond to a ping request
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * Let the client know that the server is alive and waiting on
+ * its native I/O stack.
+ * If any type of error occurs from the call to queue a ping
+ * response then the client is either not accepting or receiving
+ * interrupts.  Disconnect with an error.
+ *
+ * Return: the h_send_crq() return code.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt, interrupt lock held
+ */
+static long ibmvscsis_ping_response(struct scsi_info *vscsi)
+{
+       u64 buffer[2] = { 0, 0 };
+       struct viosrp_crq *crq = (struct viosrp_crq *)&buffer;
+       long rc;
+
+       /* Build the ping response element in the two-word buffer */
+       crq->valid = VALID_CMD_RESP_EL;
+       crq->format = (u8)MESSAGE_IN_CRQ;
+       crq->status = PING_RESPONSE;
+
+       rc = h_send_crq(vscsi->dds.unit_id, cpu_to_be64(buffer[MSG_HI]),
+                       cpu_to_be64(buffer[MSG_LOW]));
+
+       switch (rc) {
+       case H_SUCCESS:
+               break;
+
+       case H_CLOSED:
+       case H_DROPPED:
+       case H_REMOTE_PARM:
+               /*
+                * The failures are cumulative: H_CLOSED marks the client
+                * failed, H_CLOSED and H_DROPPED mark the response queue
+                * down, and all three force a reconnect.
+                */
+               if (rc == H_CLOSED)
+                       vscsi->flags |= CLIENT_FAILED;
+               if (rc != H_REMOTE_PARM)
+                       vscsi->flags |= RESPONSE_Q_DOWN;
+               dev_err(&vscsi->dev, "ping_response: h_send_crq failed, rc %ld\n",
+                       rc);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               break;
+
+       default:
+               dev_err(&vscsi->dev, "ping_response: h_send_crq returned unknown rc %ld\n",
+                       rc);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT, 0);
+               break;
+       }
+
+       return rc;
+}
+
+/**
+ * ibmvscsis_handle_init_compl_msg() - Respond to an Init Complete Message
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * Only WAIT_CONNECTION accepts the message and moves to CONNECTED.
+ *
+ * Return: ADAPT_SUCCESS if the adapter moved to CONNECTED, ERROR otherwise.
+ *
+ * Must be called with interrupt lock held.
+ */
+static long ibmvscsis_handle_init_compl_msg(struct scsi_info *vscsi)
+{
+       switch (vscsi->state) {
+       case WAIT_CONNECTION:
+               vscsi->state = CONNECTED;
+               return ADAPT_SUCCESS;
+
+               /* These states fail without logging or disconnecting */
+       case NO_QUEUE:
+       case ERR_DISCONNECT:
+       case ERR_DISCONNECT_RECONNECT:
+       case ERR_DISCONNECTED:
+       case UNCONFIGURING:
+       case UNDEFINED:
+               return ERROR;
+
+               /* Anything else is logged and forces a reconnect */
+       case WAIT_IDLE:
+       case SRP_PROCESSING:
+       case CONNECTED:
+       case WAIT_ENABLED:
+       case PART_UP_WAIT_ENAB:
+       default:
+               dev_err(&vscsi->dev, "init_msg: invalid state %d to get init compl msg\n",
+                       vscsi->state);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               return ERROR;
+       }
+}
+
+/**
+ * ibmvscsis_handle_init_msg() - Respond to an Init Message
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * In WAIT_ENABLED the message is only recorded by moving to
+ * PART_UP_WAIT_ENAB.  In WAIT_CONNECTION an Init Complete message is sent
+ * back and, on success, the adapter moves to CONNECTED.  UNCONFIGURING
+ * ignores the message; UNDEFINED fails without logging.  Any other state is
+ * logged and forces a reconnect.
+ *
+ * Must be called with interrupt lock held.
+ */
+static long ibmvscsis_handle_init_msg(struct scsi_info *vscsi)
+{
+       long rc = ADAPT_SUCCESS;
+
+       switch (vscsi->state) {
+       case WAIT_ENABLED:
+               vscsi->state = PART_UP_WAIT_ENAB;
+               break;
+
+       case UNCONFIGURING:
+               break;
+
+       case UNDEFINED:
+               rc = ERROR;
+               break;
+
+       case WAIT_CONNECTION:
+               rc = ibmvscsis_send_init_message(vscsi, INIT_COMPLETE_MSG);
+               if (rc == H_SUCCESS) {
+                       vscsi->state = CONNECTED;
+               } else if (rc == H_PARAMETER) {
+                       dev_err(&vscsi->dev, "init_msg: failed to send, rc %ld\n",
+                               rc);
+                       ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT, 0);
+               } else if (rc == H_DROPPED) {
+                       dev_err(&vscsi->dev, "init_msg: failed to send, rc %ld\n",
+                               rc);
+                       rc = ERROR;
+                       ibmvscsis_post_disconnect(vscsi,
+                                                 ERR_DISCONNECT_RECONNECT, 0);
+               } else if (rc == H_CLOSED) {
+                       /* Only warned about; reported to the caller as 0 */
+                       pr_warn("init_msg: failed to send, rc %ld\n", rc);
+                       rc = 0;
+               }
+               /* Any other return code is passed back unchanged */
+               break;
+
+       case PART_UP_WAIT_ENAB:
+       case CONNECTED:
+       case SRP_PROCESSING:
+       case WAIT_IDLE:
+       case NO_QUEUE:
+       case ERR_DISCONNECT:
+       case ERR_DISCONNECT_RECONNECT:
+       case ERR_DISCONNECTED:
+       default:
+               rc = ERROR;
+               dev_err(&vscsi->dev, "init_msg: invalid state %d to get init msg\n",
+                       vscsi->state);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               break;
+       }
+
+       return rc;
+}
+
+/**
+ * ibmvscsis_init_msg() - Respond to an init message
+ * @vscsi:     Pointer to our adapter structure
+ * @crq:       Pointer to CRQ element containing the Init Message
+ *
+ * Queries the partner information first (a failure of that query is only
+ * logged at debug level), then dispatches on the CRQ format to the Init or
+ * Init Complete handler.  An unknown format forces a reconnect.
+ *
+ * Return: the handler's return code, or ERROR for an unknown format.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt, interrupt lock held
+ */
+static long ibmvscsis_init_msg(struct scsi_info *vscsi, struct viosrp_crq *crq)
+{
+       long hrc;
+
+       pr_debug("init_msg: state 0x%hx\n", vscsi->state);
+
+       hrc = h_vioctl(vscsi->dds.unit_id, H_GET_PARTNER_INFO,
+                      (u64)vscsi->map_ioba | ((u64)PAGE_SIZE << 32), 0, 0, 0,
+                      0);
+       if (hrc != H_SUCCESS) {
+               /* Not fatal: the init message is still processed below */
+               pr_debug("init_msg h_vioctl rc %ld\n", hrc);
+       } else {
+               vscsi->client_data.partition_number =
+                       be64_to_cpu(*(u64 *)vscsi->map_buf);
+               pr_debug("init_msg, part num %d\n",
+                        vscsi->client_data.partition_number);
+       }
+
+       if (crq->format == INIT_MSG)
+               return ibmvscsis_handle_init_msg(vscsi);
+
+       if (crq->format == INIT_COMPLETE_MSG)
+               return ibmvscsis_handle_init_compl_msg(vscsi);
+
+       dev_err(&vscsi->dev, "init_msg: invalid format %d\n",
+               (uint)crq->format);
+       ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+
+       return ERROR;
+}
+
+/*
+ * Dispatch a VALID_CMD_RESP_EL element according to its format field.
+ * Called with intr lock held.
+ */
+static void ibmvscsis_parse_cmd_resp_el(struct scsi_info *vscsi,
+                                       struct viosrp_crq *crq)
+{
+       switch (crq->format) {
+       case OS400_FORMAT:
+       case AIX_FORMAT:
+       case LINUX_FORMAT:
+       case MAD_FORMAT:
+               /* Only one MAD may be in flight at a time */
+               if (!(vscsi->flags & PROCESSING_MAD)) {
+                       vscsi->flags |= PROCESSING_MAD;
+                       ibmvscsis_mad(vscsi, crq);
+                       break;
+               }
+               dev_err(&vscsi->dev, "parse_command: already processing mad\n");
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               break;
+
+       case SRP_FORMAT:
+               ibmvscsis_srp_cmd(vscsi, crq);
+               break;
+
+       case MESSAGE_IN_CRQ:
+               if (crq->status == PING)
+                       ibmvscsis_ping_response(vscsi);
+               break;
+
+       default:
+               dev_err(&vscsi->dev, "parse_command: invalid format %d\n",
+                       (uint)crq->format);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               break;
+       }
+}
+
+/**
+ * ibmvscsis_parse_command() - Parse an element taken from the cmd rsp queue.
+ * @vscsi:     Pointer to our adapter structure
+ * @crq:       Pointer to CRQ element containing the SRP request
+ *
+ * Dispatches the element on its valid field.  The return codes of the
+ * individual handlers are deliberately not propagated.
+ *
+ * Return: non-zero only when SCHEDULE_DISCONNECT is set in the adapter
+ * flags once the element has been handled, zero otherwise.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Interrupt, intr lock held
+ */
+static long ibmvscsis_parse_command(struct scsi_info *vscsi,
+                                   struct viosrp_crq *crq)
+{
+       switch (crq->valid) {
+       case VALID_CMD_RESP_EL:
+               ibmvscsis_parse_cmd_resp_el(vscsi, crq);
+               break;
+
+       case VALID_TRANS_EVENT:
+               ibmvscsis_trans_event(vscsi, crq);
+               break;
+
+       case VALID_INIT_MSG:
+               ibmvscsis_init_msg(vscsi, crq);
+               break;
+
+       default:
+               dev_err(&vscsi->dev, "parse_command: invalid valid field %d\n",
+                       (uint)crq->valid);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               break;
+       }
+
+       /*
+        * Report only what the interrupt handler cares about; for most
+        * errors processing simply carries on.
+        */
+       return vscsi->flags & SCHEDULE_DISCONNECT;
+}
+
+/*
+ * Read the LIOBNs of the local and remote DMA windows from the
+ * "ibm,my-dma-window" attribute of the adapter's vio device and store them
+ * in vscsi->dds.window[].
+ *
+ * Returns 0 on success, -1 if the attribute is missing.
+ */
+static int read_dma_window(struct scsi_info *vscsi)
+{
+       struct vio_dev *vdev = vscsi->dma_dev;
+       const __be32 *cursor;
+       const __be32 *cells;
+
+       /* TODO Using of_parse_dma_window would be better, but it doesn't give
+        * a way to read multiple windows without already knowing the size of
+        * a window or the number of windows.
+        */
+       cursor = (const __be32 *)vio_get_attribute(vdev, "ibm,my-dma-window",
+                                                  NULL);
+       if (!cursor) {
+               pr_err("Couldn't find ibm,my-dma-window property\n");
+               return -1;
+       }
+
+       /* First cell of the property is the local window's LIOBN */
+       vscsi->dds.window[LOCAL].liobn = be32_to_cpu(*cursor);
+       cursor++;
+
+       /* Skip the address cells; step over one cell if the count is absent */
+       cells = (const __be32 *)vio_get_attribute(vdev,
+                                                 "ibm,#dma-address-cells",
+                                                 NULL);
+       if (cells) {
+               cursor += be32_to_cpu(*cells);
+       } else {
+               pr_warn("Couldn't find ibm,#dma-address-cells property\n");
+               cursor++;
+       }
+
+       /* Skip the size cells the same way */
+       cells = (const __be32 *)vio_get_attribute(vdev, "ibm,#dma-size-cells",
+                                                 NULL);
+       if (cells) {
+               cursor += be32_to_cpu(*cells);
+       } else {
+               pr_warn("Couldn't find ibm,#dma-size-cells property\n");
+               cursor++;
+       }
+
+       /* cursor should point to the second window now */
+       vscsi->dds.window[REMOTE].liobn = be32_to_cpu(*cursor);
+
+       return 0;
+}
+
+/*
+ * Walk ibmvscsis_dev_list under ibmvscsis_dev_lock and return the target
+ * port of the adapter whose vio device name equals @name, or NULL if no
+ * adapter matches.
+ */
+static struct ibmvscsis_tport *ibmvscsis_lookup_port(const char *name)
+{
+       struct ibmvscsis_tport *match = NULL;
+       struct scsi_info *adapter;
+
+       spin_lock_bh(&ibmvscsis_dev_lock);
+       list_for_each_entry(adapter, &ibmvscsis_dev_list, list) {
+               if (strcmp(dev_name(&adapter->dma_dev->dev), name))
+                       continue;
+
+               match = &adapter->tport;
+               break;
+       }
+       spin_unlock_bh(&ibmvscsis_dev_lock);
+
+       return match;
+}
+
+/**
+ * ibmvscsis_parse_cmd() - Parse SRP Command
+ * @vscsi:     Pointer to our adapter structure
+ * @cmd:       Pointer to command element with SRP command
+ *
+ * Parse the srp command; if it is valid then submit it to tcm.
+ * Nothing is returned: a malformed IU or a failed submission is handled
+ * here by posting a disconnect/reconnect, and the status of the SCSI CDB
+ * itself is not reported through this function.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Process level
+ */
+static void ibmvscsis_parse_cmd(struct scsi_info *vscsi,
+                               struct ibmvscsis_cmd *cmd)
+{
+       struct iu_entry *iue = cmd->iue;
+       struct srp_cmd *srp = (struct srp_cmd *)iue->sbuf->buf;
+       struct ibmvscsis_nexus *nexus;
+       u64 data_len = 0;
+       enum dma_data_direction dir;
+       int attr = 0;
+       int rc = 0;
+
+       nexus = vscsi->tport.ibmv_nexus;
+       /*
+        * additional length in bytes.  Note that the SRP spec says that
+        * additional length is in 4-byte words, but technically the
+        * additional length field is only the upper 6 bits of the byte.
+        * The lower 2 bits are reserved.  If the lower 2 bits are 0 (as
+        * all reserved fields should be), then interpreting the byte as
+        * an int will yield the length in bytes.
+        */
+       if (srp->add_cdb_len & 0x03) {
+               dev_err(&vscsi->dev, "parse_cmd: reserved bits set in IU\n");
+               spin_lock_bh(&vscsi->intr_lock);
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               ibmvscsis_free_cmd_resources(vscsi, cmd);
+               spin_unlock_bh(&vscsi->intr_lock);
+               return;
+       }
+
+       if (srp_get_desc_table(srp, &dir, &data_len)) {
+               dev_err(&vscsi->dev, "0x%llx: parsing SRP descriptor table failed.\n",
+                       srp->tag);
+               goto fail;
+       }
+
+       cmd->rsp.sol_not = srp->sol_not;
+
+       /* Translate the SRP task attribute into its TCM equivalent */
+       switch (srp->task_attr) {
+       case SRP_SIMPLE_TASK:
+               attr = TCM_SIMPLE_TAG;
+               break;
+       case SRP_ORDERED_TASK:
+               attr = TCM_ORDERED_TAG;
+               break;
+       case SRP_HEAD_TASK:
+               attr = TCM_HEAD_TAG;
+               break;
+       case SRP_ACA_TASK:
+               attr = TCM_ACA_TAG;
+               break;
+       default:
+               dev_err(&vscsi->dev, "Invalid task attribute %d\n",
+                       srp->task_attr);
+               goto fail;
+       }
+
+       cmd->se_cmd.tag = be64_to_cpu(srp->tag);
+
+       /* The command goes on active_q before it is handed to tcm */
+       spin_lock_bh(&vscsi->intr_lock);
+       list_add_tail(&cmd->list, &vscsi->active_q);
+       spin_unlock_bh(&vscsi->intr_lock);
+
+       /* Keep only the low six bits of the first LUN byte */
+       srp->lun.scsi_lun[0] &= 0x3f;
+
+       pr_debug("calling submit_cmd, se_cmd %p, lun 0x%llx, cdb 0x%x, attr:%d\n",
+                &cmd->se_cmd, scsilun_to_int(&srp->lun), (int)srp->cdb[0],
+                attr);
+
+       rc = target_submit_cmd(&cmd->se_cmd, nexus->se_sess, srp->cdb,
+                              cmd->sense_buf, scsilun_to_int(&srp->lun),
+                              data_len, attr, dir, 0);
+       if (rc) {
+               dev_err(&vscsi->dev, "target_submit_cmd failed, rc %d\n", rc);
+               goto fail;
+       }
+       return;
+
+fail:
+       /*
+        * NOTE(review): unlike the reserved-bits path above, this path does
+        * not call ibmvscsis_free_cmd_resources() - confirm that the
+        * disconnect handling reclaims the command.
+        */
+       spin_lock_bh(&vscsi->intr_lock);
+       ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+       spin_unlock_bh(&vscsi->intr_lock);
+}
+
+/**
+ * ibmvscsis_parse_task() - Parse SRP Task Management Request
+ * @vscsi:     Pointer to our adapter structure
+ * @cmd:       Pointer to command element with SRP task management request
+ *
+ * Parse the srp task management request; if it is valid then submit it to tcm.
+ * An unknown function, or a request that tcm refuses, is answered through
+ * transport_send_check_condition_and_sense().  Nothing is returned; the
+ * status of the task management request itself is not reported here.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Process level (takes intr_lock itself)
+ */
+static void ibmvscsis_parse_task(struct scsi_info *vscsi,
+                                struct ibmvscsis_cmd *cmd)
+{
+       struct iu_entry *iue = cmd->iue;
+       struct srp_tsk_mgmt *srp_tsk = &vio_iu(iue)->srp.tsk_mgmt;
+       struct ibmvscsis_nexus *nexus = vscsi->tport.ibmv_nexus;
+       u64 tag_to_abort = 0;
+       int tcm_type;
+       int rc;
+
+       cmd->rsp.sol_not = srp_tsk->sol_not;
+
+       /* Translate the SRP function code into the TCM one */
+       switch (srp_tsk->tsk_mgmt_func) {
+       case SRP_TSK_ABORT_TASK:
+               tcm_type = TMR_ABORT_TASK;
+               tag_to_abort = be64_to_cpu(srp_tsk->task_tag);
+               break;
+       case SRP_TSK_ABORT_TASK_SET:
+               tcm_type = TMR_ABORT_TASK_SET;
+               break;
+       case SRP_TSK_CLEAR_TASK_SET:
+               tcm_type = TMR_CLEAR_TASK_SET;
+               break;
+       case SRP_TSK_LUN_RESET:
+               tcm_type = TMR_LUN_RESET;
+               break;
+       case SRP_TSK_CLEAR_ACA:
+               tcm_type = TMR_CLEAR_ACA;
+               break;
+       default:
+               dev_err(&vscsi->dev, "unknown task mgmt func %d\n",
+                       srp_tsk->tsk_mgmt_func);
+               /*
+                * NOTE(review): se_tmr_req is dereferenced here before
+                * target_submit_tmr() has been called for this command -
+                * confirm that it is already valid at this point.
+                */
+               cmd->se_cmd.se_tmr_req->response =
+                       TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED;
+               transport_send_check_condition_and_sense(&cmd->se_cmd, 0, 0);
+               return;
+       }
+
+       cmd->se_cmd.tag = be64_to_cpu(srp_tsk->tag);
+
+       /* The command goes on active_q before it is handed to tcm */
+       spin_lock_bh(&vscsi->intr_lock);
+       list_add_tail(&cmd->list, &vscsi->active_q);
+       spin_unlock_bh(&vscsi->intr_lock);
+
+       /* Keep only the low six bits of the first LUN byte */
+       srp_tsk->lun.scsi_lun[0] &= 0x3f;
+
+       pr_debug("calling submit_tmr, func %d\n",
+                srp_tsk->tsk_mgmt_func);
+       rc = target_submit_tmr(&cmd->se_cmd, nexus->se_sess, NULL,
+                              scsilun_to_int(&srp_tsk->lun), srp_tsk,
+                              tcm_type, GFP_KERNEL, tag_to_abort, 0);
+       if (!rc)
+               return;
+
+       dev_err(&vscsi->dev, "target_submit_tmr failed, rc %d\n",
+               rc);
+       cmd->se_cmd.se_tmr_req->response =
+               TMR_FUNCTION_REJECTED;
+       transport_send_check_condition_and_sense(&cmd->se_cmd, 0, 0);
+}
+
+/*
+ * Work handler for a single command: unlink it from the list it was queued
+ * on, then either discard it because a disconnect is in progress or pass it
+ * to the parser that matches its type.
+ */
+static void ibmvscsis_scheduler(struct work_struct *work)
+{
+       struct ibmvscsis_cmd *cmd;
+       struct scsi_info *vscsi;
+       bool disconnecting;
+
+       cmd = container_of(work, struct ibmvscsis_cmd, work);
+       vscsi = cmd->adapter;
+
+       spin_lock_bh(&vscsi->intr_lock);
+
+       /* Take the command off the schedule_q */
+       list_del(&cmd->list);
+
+       disconnecting = !!(vscsi->flags &
+                          (SCHEDULE_DISCONNECT | DISCONNECT_SCHEDULED));
+       if (disconnecting) {
+               /* Never submit while disconnecting; release the command */
+               ibmvscsis_free_cmd_resources(vscsi, cmd);
+
+               /* Someone may be waiting for both queues to drain */
+               if ((vscsi->flags & WAIT_FOR_IDLE) &&
+                   list_empty(&vscsi->active_q) &&
+                   list_empty(&vscsi->schedule_q)) {
+                       vscsi->flags &= ~WAIT_FOR_IDLE;
+                       complete(&vscsi->wait_idle);
+               }
+       }
+
+       spin_unlock_bh(&vscsi->intr_lock);
+
+       if (disconnecting)
+               return;
+
+       /* The parsers are called without the interrupt lock held */
+       switch (cmd->type) {
+       case SCSI_CDB:
+               ibmvscsis_parse_cmd(vscsi, cmd);
+               break;
+       case TASK_MANAGEMENT:
+               ibmvscsis_parse_task(vscsi, cmd);
+               break;
+       default:
+               dev_err(&vscsi->dev, "scheduler, invalid cmd type %d\n",
+                       cmd->type);
+               spin_lock_bh(&vscsi->intr_lock);
+               ibmvscsis_free_cmd_resources(vscsi, cmd);
+               spin_unlock_bh(&vscsi->intr_lock);
+               break;
+       }
+}
+
+/*
+ * Allocate a zeroed pool of @num command elements, initialize each one's
+ * adapter back-pointer and work item, and place them all on the adapter's
+ * free_cmd list.  Returns 0 on success or -ENOMEM if the pool cannot be
+ * allocated.
+ */
+static int ibmvscsis_alloc_cmds(struct scsi_info *vscsi, int num)
+{
+       struct ibmvscsis_cmd *pool;
+       int idx;
+
+       INIT_LIST_HEAD(&vscsi->free_cmd);
+
+       vscsi->cmd_pool = kcalloc(num, sizeof(struct ibmvscsis_cmd),
+                                 GFP_KERNEL);
+       if (!vscsi->cmd_pool)
+               return -ENOMEM;
+
+       pool = (struct ibmvscsis_cmd *)vscsi->cmd_pool;
+       for (idx = 0; idx < num; idx++) {
+               struct ibmvscsis_cmd *entry = &pool[idx];
+
+               entry->adapter = vscsi;
+               INIT_WORK(&entry->work, ibmvscsis_scheduler);
+               list_add_tail(&entry->list, &vscsi->free_cmd);
+       }
+
+       return 0;
+}
+
+/*
+ * Release the command pool and leave the adapter with an empty free_cmd
+ * list and a NULL pool pointer.
+ */
+static void ibmvscsis_free_cmds(struct scsi_info *vscsi)
+{
+       void *pool = vscsi->cmd_pool;
+
+       kfree(pool);
+       vscsi->cmd_pool = NULL;
+
+       /* The list nodes lived inside the pool, so reset the list head */
+       INIT_LIST_HEAD(&vscsi->free_cmd);
+}
+
+/**
+ * ibmvscsis_service_wait_q() - Service Waiting Queue
+ * @timer:     Pointer to timer which has expired
+ *
+ * Timer callback for the response-queue timer.  Elements on the waiting
+ * queue have completed and their responses have been copied to the client,
+ * but the client's response queue was full so the queue message could not
+ * be sent.  Under the interrupt lock this counts the pop, marks the timer
+ * as no longer started and calls ibmvscsis_send_messages().  The timer is
+ * never restarted from here.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     called at interrupt level
+ */
+static enum hrtimer_restart ibmvscsis_service_wait_q(struct hrtimer *timer)
+{
+       struct timer_cb *tcb;
+       struct scsi_info *vscsi;
+
+       /* hrtimer -> enclosing timer_cb -> enclosing adapter */
+       tcb = container_of(timer, struct timer_cb, timer);
+       vscsi = container_of(tcb, struct scsi_info, rsp_q_timer);
+
+       spin_lock_bh(&vscsi->intr_lock);
+
+       tcb->timer_pops++;
+       tcb->started = false;
+       ibmvscsis_send_messages(vscsi);
+
+       spin_unlock_bh(&vscsi->intr_lock);
+
+       return HRTIMER_NORESTART;
+}
+
+/*
+ * Set up the adapter's response-queue timer as a relative CLOCK_MONOTONIC
+ * hrtimer whose callback is ibmvscsis_service_wait_q(), and reset its
+ * bookkeeping.  Always returns ADAPT_SUCCESS.
+ */
+static long ibmvscsis_alloctimer(struct scsi_info *vscsi)
+{
+       struct timer_cb *tcb = &vscsi->rsp_q_timer;
+
+       hrtimer_init(&tcb->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       tcb->timer.function = ibmvscsis_service_wait_q;
+
+       /* Not armed yet and no expirations counted */
+       tcb->started = false;
+       tcb->timer_pops = 0;
+
+       return ADAPT_SUCCESS;
+}
+
+/*
+ * Cancel the adapter's response-queue timer and reset its bookkeeping.
+ * The result of hrtimer_cancel() is deliberately ignored.
+ */
+static void ibmvscsis_freetimer(struct scsi_info *vscsi)
+{
+       struct timer_cb *tcb = &vscsi->rsp_q_timer;
+
+       (void)hrtimer_cancel(&tcb->timer);
+
+       tcb->started = false;
+       tcb->timer_pops = 0;
+}
+
+/*
+ * Interrupt handler: disable further interrupts from the device and defer
+ * the actual queue processing to the adapter's work_task tasklet.
+ * @dummy is unused; @data is the adapter structure.
+ */
+static irqreturn_t ibmvscsis_interrupt(int dummy, void *data)
+{
+       struct scsi_info *adapter = data;
+
+       vio_disable_interrupts(adapter->dma_dev);
+
+       tasklet_schedule(&adapter->work_task);
+
+       return IRQ_HANDLED;
+}
+
+/**
+ * ibmvscsis_check_q() - Helper function to Check Init Message Valid
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * Checks if an initialize message was queued by the initiator
+ * while the timing window was open.  This function is called from
+ * probe after the CRQ is created and interrupts are enabled.
+ * It would only be used by adapters who wait for some event before
+ * completing the init handshake with the client.  For ibmvscsi, this
+ * event is waiting for the port to be enabled.
+ *
+ * On failure of the check a disconnect/reconnect is posted; otherwise the
+ * adapter state becomes WAIT_ENABLED when no message format was found and
+ * PART_UP_WAIT_ENAB when one was.  The result of the check is returned.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Process level only, interrupt lock held
+ */
+static long ibmvscsis_check_q(struct scsi_info *vscsi)
+{
+       uint msg_format;
+       long rc;
+
+       rc = ibmvscsis_check_init_msg(vscsi, &msg_format);
+       if (rc) {
+               ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0);
+               return rc;
+       }
+
+       if (msg_format == UNUSED_FORMAT)
+               vscsi->state = WAIT_ENABLED;
+       else
+               vscsi->state = PART_UP_WAIT_ENAB;
+
+       return rc;
+}
+
+/**
+ * ibmvscsis_enable_change_state() - Set new state based on enabled status
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * Now that we are enabled, work out the next adapter state.  Depending on
+ * the current state this sends an Init or Init Complete message to the
+ * client and moves the state machine according to the result.
+ *
+ * Must be called with interrupt lock held.
+ */
+static long ibmvscsis_enable_change_state(struct scsi_info *vscsi)
+{
+       long rc = ADAPT_SUCCESS;
+
+handle_state_change:
+       switch (vscsi->state) {
+       case WAIT_ENABLED:
+               rc = ibmvscsis_send_init_message(vscsi, INIT_MSG);
+               if (rc == H_SUCCESS || rc == H_DROPPED || rc == H_CLOSED) {
+                       vscsi->state =  WAIT_CONNECTION;
+                       rc = ADAPT_SUCCESS;
+               } else if (rc != H_PARAMETER && rc != H_HARDWARE) {
+                       /* Anything unexpected is reported as H_HARDWARE */
+                       vscsi->state = UNDEFINED;
+                       rc = H_HARDWARE;
+               }
+               /* H_PARAMETER and H_HARDWARE are returned unchanged */
+               break;
+
+       case PART_UP_WAIT_ENAB:
+               rc = ibmvscsis_send_init_message(vscsi, INIT_COMPLETE_MSG);
+               if (rc == H_SUCCESS) {
+                       vscsi->state = CONNECTED;
+                       rc = ADAPT_SUCCESS;
+               } else if (rc == H_DROPPED || rc == H_CLOSED) {
+                       /* Fall back and redo the handshake from WAIT_ENABLED */
+                       vscsi->state = WAIT_ENABLED;
+                       goto handle_state_change;
+               } else if (rc != H_PARAMETER && rc != H_HARDWARE) {
+                       rc = H_HARDWARE;
+               }
+               /* H_PARAMETER and H_HARDWARE are returned unchanged */
+               break;
+
+       case WAIT_CONNECTION:
+       case WAIT_IDLE:
+       case SRP_PROCESSING:
+       case CONNECTED:
+               /* Nothing to do in these states */
+               rc = ADAPT_SUCCESS;
+               break;
+
+       case UNCONFIGURING:
+               /* should not be able to get here */
+               rc = ERROR;
+               vscsi->state = UNDEFINED;
+               break;
+
+       case ERR_DISCONNECT:
+       case ERR_DISCONNECT_RECONNECT:
+       default:
+               /* driver should never allow this to happen */
+               dev_err(&vscsi->dev, "in invalid state %d during enable_change_state\n",
+                       vscsi->state);
+               rc = ADAPT_SUCCESS;
+               break;
+       }
+
+       return rc;
+}
+
+/**
+ * ibmvscsis_create_command_q() - Create Command Queue
+ * @vscsi:     Pointer to our adapter structure
+ * @num_cmds:  Currently unused.  In the future, may be used to determine
+ *             the size of the CRQ.
+ *
+ * Allocates one zeroed page for the command queue, DMA-maps it and
+ * registers it as the CRQ with h_reg_crq().  On success, or when
+ * registration reports H_CLOSED, the adapter state becomes WAIT_ENABLED
+ * and 0 is returned.  Returns -ENOMEM if the page cannot be allocated or
+ * mapped and -ENODEV on any other registration failure; in the failure
+ * cases the page is unmapped and freed again.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Process level only
+ */
+static long ibmvscsis_create_command_q(struct scsi_info *vscsi, int num_cmds)
+{
+       struct vio_dev *vdev = vscsi->dma_dev;
+       /* We might support multiple pages in the future, but just 1 for now */
+       int nr_pages = 1;
+       long hrc;
+
+       vscsi->cmd_q.size = nr_pages;
+
+       vscsi->cmd_q.base_addr =
+               (struct viosrp_crq *)get_zeroed_page(GFP_KERNEL);
+       if (!vscsi->cmd_q.base_addr)
+               return -ENOMEM;
+
+       vscsi->cmd_q.mask = ((uint)nr_pages * CRQ_PER_PAGE) - 1;
+
+       vscsi->cmd_q.crq_token = dma_map_single(&vdev->dev,
+                                               vscsi->cmd_q.base_addr,
+                                               PAGE_SIZE, DMA_BIDIRECTIONAL);
+       if (dma_mapping_error(&vdev->dev, vscsi->cmd_q.crq_token)) {
+               free_page((unsigned long)vscsi->cmd_q.base_addr);
+               return -ENOMEM;
+       }
+
+       hrc = h_reg_crq(vscsi->dds.unit_id, vscsi->cmd_q.crq_token, PAGE_SIZE);
+       if (hrc && hrc != H_CLOSED) {
+               /* Registration failed for real: undo the mapping and page */
+               dma_unmap_single(&vdev->dev, vscsi->cmd_q.crq_token,
+                                PAGE_SIZE, DMA_BIDIRECTIONAL);
+               free_page((unsigned long)vscsi->cmd_q.base_addr);
+               return -ENODEV;
+       }
+
+       /* H_CLOSED is treated the same as success */
+       vscsi->state = WAIT_ENABLED;
+
+       return 0;
+}
+
+/**
+ * ibmvscsis_destroy_command_q - Destroy Command Queue
+ * @vscsi:     Pointer to our adapter structure
+ *
+ * Unmaps the command queue page, frees it, clears the queue's base
+ * address and puts the adapter into the NO_QUEUE state.
+ *
+ * EXECUTION ENVIRONMENT:
+ *     Process level only
+ */
+static void ibmvscsis_destroy_command_q(struct scsi_info *vscsi)
+{
+       struct vio_dev *vdev = vscsi->dma_dev;
+
+       dma_unmap_single(&vdev->dev, vscsi->cmd_q.crq_token, PAGE_SIZE,
+                        DMA_BIDIRECTIONAL);
+
+       free_page((unsigned long)vscsi->cmd_q.base_addr);
+       vscsi->cmd_q.base_addr = NULL;
+
+       vscsi->state = NO_QUEUE;
+}
+
+/*
+ * Decide which SCSI status to report for a completed command.
+ *
+ * Normally this is the command's own scsi_status.  When fast fail is
+ * enabled on the adapter, the command is a read or write, its sense data
+ * normalizes to a HARDWARE_ERROR sense key, and the residual count is
+ * either zero or the whole data length, the command is flagged
+ * CMD_FAST_FAIL and NO_SENSE is returned instead.
+ */
+static u8 ibmvscsis_fast_fail(struct scsi_info *vscsi,
+                             struct ibmvscsis_cmd *cmd)
+{
+       struct se_cmd *se_cmd = &cmd->se_cmd;
+       struct srp_cmd *srp = (struct srp_cmd *)cmd->iue->sbuf->buf;
+       struct scsi_sense_hdr sshdr;
+       u8 status = se_cmd->scsi_status;
+
+       if (!vscsi->fast_fail)
+               return status;
+
+       if (!READ_CMD(srp->cdb) && !WRITE_CMD(srp->cdb))
+               return status;
+
+       if (!scsi_normalize_sense(se_cmd->sense_buffer,
+                                 se_cmd->scsi_sense_length, &sshdr))
+               return status;
+
+       if (sshdr.sense_key != HARDWARE_ERROR)
+               return status;
+
+       if (se_cmd->residual_count != 0 &&
+           se_cmd->residual_count != se_cmd->data_length)
+               return status;
+
+       cmd->flags |= CMD_FAST_FAIL;
+       status = NO_SENSE;
+
+       return status;
+}
+
+/**
+ * srp_build_response() - Build an SRP response buffer
+ * @vscsi:     Pointer to our adapter structure
+ * @cmd:       Pointer to command for which to send the response
+ * @len_p:     Where to return the length of the IU response sent.  This
+ *             is needed to construct the CRQ response.  Only written when
+ *             the copy to the client succeeds.
+ *
+ * Build the SRP response buffer and copy it to the client's memory space.
+ * The whole operation runs under the adapter's interrupt lock.
+ *
+ * Return: the return code of h_copy_rdma() (H_SUCCESS on success).
+ */
+static long srp_build_response(struct scsi_info *vscsi,
+                              struct ibmvscsis_cmd *cmd, uint *len_p)
+{
+       struct iu_entry *iue = cmd->iue;
+       struct se_cmd *se_cmd = &cmd->se_cmd;
+       struct srp_rsp *rsp;
+       uint len;
+       u32 rsp_code;
+       char *data;
+       u32 *tsk_status;
+       long rc = ADAPT_SUCCESS;
+
+       spin_lock_bh(&vscsi->intr_lock);
+
+       /* The response is built in place in the command's IU buffer */
+       rsp = &vio_iu(iue)->srp.rsp;
+       len = sizeof(*rsp);
+       memset(rsp, 0, len);
+       data = rsp->data;
+
+       rsp->opcode = SRP_RSP;
+
+       /*
+        * Report accumulated credit as the request limit delta; outside of
+        * SRP_PROCESSING (or with no credit) one extra is added.
+        */
+       if (vscsi->credit > 0 && vscsi->state == SRP_PROCESSING)
+               rsp->req_lim_delta = cpu_to_be32(vscsi->credit);
+       else
+               rsp->req_lim_delta = cpu_to_be32(1 + vscsi->credit);
+       rsp->tag = cmd->rsp.tag;
+       rsp->flags = 0;
+
+       if (cmd->type == SCSI_CDB) {
+               rsp->status = ibmvscsis_fast_fail(vscsi, cmd);
+               if (rsp->status) {
+                       pr_debug("build_resp: cmd %p, scsi status %d\n", cmd,
+                                (int)rsp->status);
+                       ibmvscsis_determine_resid(se_cmd, rsp);
+                       if (se_cmd->scsi_sense_length && se_cmd->sense_buffer) {
+                               /*
+                                * Append the sense data after the fixed part
+                                * of the response.
+                                * NOTE(review): the sense length is not
+                                * checked against the IU buffer size here -
+                                * confirm it is bounded elsewhere.
+                                */
+                               rsp->sense_data_len =
+                                       cpu_to_be32(se_cmd->scsi_sense_length);
+                               rsp->flags |= SRP_RSP_FLAG_SNSVALID;
+                               len += se_cmd->scsi_sense_length;
+                               memcpy(data, se_cmd->sense_buffer,
+                                      se_cmd->scsi_sense_length);
+                       }
+                       /* Non-zero status: use the UCSOLNT bit of sol_not */
+                       rsp->sol_not = (cmd->rsp.sol_not & UCSOLNT) >>
+                               UCSOLNT_RESP_SHIFT;
+               } else if (cmd->flags & CMD_FAST_FAIL) {
+                       pr_debug("build_resp: cmd %p, fast fail\n", cmd);
+                       rsp->sol_not = (cmd->rsp.sol_not & UCSOLNT) >>
+                               UCSOLNT_RESP_SHIFT;
+               } else {
+                       /* Zero status, no fast fail: use the SCSOLNT bit */
+                       rsp->sol_not = (cmd->rsp.sol_not & SCSOLNT) >>
+                               SCSOLNT_RESP_SHIFT;
+               }
+       } else {
+               /* this is task management */
+               rsp->status = 0;
+               rsp->resp_data_len = cpu_to_be32(4);
+               rsp->flags |= SRP_RSP_FLAG_RSPVALID;
+
+               /* Translate the TMR response into an SRP response code */
+               switch (se_cmd->se_tmr_req->response) {
+               case TMR_FUNCTION_COMPLETE:
+               case TMR_TASK_DOES_NOT_EXIST:
+                       rsp_code = SRP_TASK_MANAGEMENT_FUNCTION_COMPLETE;
+                       rsp->sol_not = (cmd->rsp.sol_not & SCSOLNT) >>
+                               SCSOLNT_RESP_SHIFT;
+                       break;
+               case TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED:
+               case TMR_LUN_DOES_NOT_EXIST:
+                       rsp_code = SRP_TASK_MANAGEMENT_FUNCTION_NOT_SUPPORTED;
+                       rsp->sol_not = (cmd->rsp.sol_not & UCSOLNT) >>
+                               UCSOLNT_RESP_SHIFT;
+                       break;
+               case TMR_FUNCTION_FAILED:
+               case TMR_FUNCTION_REJECTED:
+               default:
+                       rsp_code = SRP_TASK_MANAGEMENT_FUNCTION_FAILED;
+                       rsp->sol_not = (cmd->rsp.sol_not & UCSOLNT) >>
+                               UCSOLNT_RESP_SHIFT;
+                       break;
+               }
+
+               /* The 4-byte response code follows the fixed response */
+               tsk_status = (u32 *)data;
+               *tsk_status = cpu_to_be32(rsp_code);
+               /* data is not read again after this assignment */
+               data = (char *)(tsk_status + 1);
+               len += 4;
+       }
+
+       /* Order the writes to the response before the copy to the client */
+       dma_wmb();
+       rc = h_copy_rdma(len, vscsi->dds.window[LOCAL].liobn, iue->sbuf->dma,
+                        vscsi->dds.window[REMOTE].liobn,
+                        be64_to_cpu(iue->remote_token));
+
+       switch (rc) {
+       case H_SUCCESS:
+               /* The credit has now been reported to the client */
+               vscsi->credit = 0;
+               *len_p = len;
+               break;
+       case H_PERMISSION:
+               if (connection_broken(vscsi))
+                       vscsi->flags |= RESPONSE_Q_DOWN | CLIENT_FAILED;
+
+               dev_err(&vscsi->dev, "build_response: error copying to client, rc %ld, flags 0x%x, state 0x%hx\n",
+                       rc, vscsi->flags, vscsi->state);
+               break;
+       case H_SOURCE_PARM:
+       case H_DEST_PARM:
+       default:
+               dev_err(&vscsi->dev, "build_response: error copying to client, rc %ld\n",
+                       rc);
+               break;
+       }
+
+       spin_unlock_bh(&vscsi->intr_lock);
+
+       return rc;
+}
+
+/**
+ * ibmvscsis_rdma() - Copy data between a local sg list and client buffers
+ * @cmd:       Command the transfer belongs to
+ * @sg:        Local scatter/gather list (DMA addresses and lengths are used)
+ * @nsg:       Not referenced in this function
+ * @md:        Array of client memory descriptors (big-endian va/len)
+ * @nmd:       Number of entries in @md
+ * @dir:       DMA_TO_DEVICE copies from the client into the sg list; any
+ *             other value copies from the sg list to the client
+ * @bytes:     Total number of bytes to transfer
+ *
+ * Walks the sg list and the descriptor array in parallel, issuing one
+ * h_copy_rdma() per chunk.  A chunk is limited by what remains of the
+ * transfer, of the current descriptor, of the current sg entry, and by
+ * max_vdma_size.
+ *
+ * Return: 0 on success (or when @bytes is 0), -EIO if either list runs
+ * out before @bytes have been copied, otherwise the failing h_copy_rdma()
+ * return code.
+ */
+static int ibmvscsis_rdma(struct ibmvscsis_cmd *cmd, struct scatterlist *sg,
+                         int nsg, struct srp_direct_buf *md, int nmd,
+                         enum dma_data_direction dir, unsigned int bytes)
+{
+       struct iu_entry *iue = cmd->iue;
+       struct srp_target *target = iue->target;
+       struct scsi_info *vscsi = target->ldata;
+       struct scatterlist *sgp;
+       dma_addr_t client_ioba, server_ioba;
+       ulong buf_len;
+       ulong client_len, server_len;
+       int md_idx;
+       long tx_len;
+       long rc = 0;
+
+       pr_debug("rdma: dir %d, bytes 0x%x\n", dir, bytes);
+
+       if (bytes == 0)
+               return 0;
+
+       /* A remaining length of 0 means "load the next entry" below */
+       sgp = sg;
+       client_len = 0;
+       server_len = 0;
+       md_idx = 0;
+       tx_len = bytes;
+
+       do {
+               /* Current client descriptor used up: move to the next one */
+               if (client_len == 0) {
+                       if (md_idx >= nmd) {
+                               dev_err(&vscsi->dev, "rdma: ran out of client memory descriptors\n");
+                               rc = -EIO;
+                               break;
+                       }
+                       client_ioba = be64_to_cpu(md[md_idx].va);
+                       client_len = be32_to_cpu(md[md_idx].len);
+               }
+               /* Current sg entry used up: load the next one */
+               if (server_len == 0) {
+                       if (!sgp) {
+                               dev_err(&vscsi->dev, "rdma: ran out of scatter/gather list\n");
+                               rc = -EIO;
+                               break;
+                       }
+                       server_ioba = sg_dma_address(sgp);
+                       server_len = sg_dma_len(sgp);
+               }
+
+               /* Chunk size: the smallest of all the applicable limits */
+               buf_len = tx_len;
+
+               if (buf_len > client_len)
+                       buf_len = client_len;
+
+               if (buf_len > server_len)
+                       buf_len = server_len;
+
+               if (buf_len > max_vdma_size)
+                       buf_len = max_vdma_size;
+
+               if (dir == DMA_TO_DEVICE) {
+                       /* read from client */
+                       rc = h_copy_rdma(buf_len,
+                                        vscsi->dds.window[REMOTE].liobn,
+                                        client_ioba,
+                                        vscsi->dds.window[LOCAL].liobn,
+                                        server_ioba);
+               } else {
+                       /* write to client */
+                       struct srp_cmd *srp = (struct srp_cmd *)iue->sbuf->buf;
+
+                       /* Hex-dump the outgoing data unless it is a read */
+                       if (!READ_CMD(srp->cdb))
+                               print_hex_dump_bytes(" data:", DUMP_PREFIX_NONE,
+                                                    sg_virt(sgp), buf_len);
+                       /* The h_copy_rdma will cause phyp, running in another
+                        * partition, to read memory, so we need to make sure
+                        * the data has been written out, hence these syncs.
+                        */
+                       /* ensure that everything is in memory */
+                       isync();
+                       /* ensure that memory has been made visible */
+                       dma_wmb();
+                       rc = h_copy_rdma(buf_len,
+                                        vscsi->dds.window[LOCAL].liobn,
+                                        server_ioba,
+                                        vscsi->dds.window[REMOTE].liobn,
+                                        client_ioba);
+               }
+               switch (rc) {
+               case H_SUCCESS:
+                       break;
+               case H_PERMISSION:
+               case H_SOURCE_PARM:
+               case H_DEST_PARM:
+                       /* Flag the adapter if the connection has gone away */
+                       if (connection_broken(vscsi)) {
+                               spin_lock_bh(&vscsi->intr_lock);
+                               vscsi->flags |=
+                                       (RESPONSE_Q_DOWN | CLIENT_FAILED);
+                               spin_unlock_bh(&vscsi->intr_lock);
+                       }
+                       dev_err(&vscsi->dev, "rdma: h_copy_rdma failed, rc %ld\n",
+                               rc);
+                       break;
+
+               default:
+                       dev_err(&vscsi->dev, "rdma: unknown error %ld from h_copy_rdma\n",
+                               rc);
+                       break;
+               }
+
+               if (!rc) {
+                       tx_len -= buf_len;
+                       if (tx_len) {
+                               /*
+                                * More to copy: advance within, or step past,
+                                * the current client and server buffers.
+                                */
+                               client_len -= buf_len;
+                               if (client_len == 0)
+                                       md_idx++;
+                               else
+                                       client_ioba += buf_len;
+
+                               server_len -= buf_len;
+                               if (server_len == 0)
+                                       sgp = sg_next(sgp);
+                               else
+                                       server_ioba += buf_len;
+                       } else {
+                               /* Everything has been transferred */
+                               break;
+                       }
+               }
+       } while (!rc);
+
+       return rc;
+}
+
+/**
+ * ibmvscsis_handle_crq() - Handle CRQ
+ * @data:      Pointer to our adapter structure
+ *
+ * Read the command elements from the command queue and copy the payloads
+ * associated with the command elements to local memory and execute the
+ * SRP requests.  Runs as the adapter's work_task tasklet with the
+ * interrupt lock held for the whole pass.
+ *
+ * Note: this is an edge triggered interrupt. It can not be shared.
+ */
+static void ibmvscsis_handle_crq(unsigned long data)
+{
+       struct scsi_info *vscsi = (struct scsi_info *)data;
+       struct viosrp_crq *crq;
+       long rc;
+       bool ack = true;
+       volatile u8 valid;
+
+       spin_lock_bh(&vscsi->intr_lock);
+
+       pr_debug("got interrupt\n");
+
+       /*
+        * if we are in a path where we are waiting for all pending commands
+        * to complete because we received a transport event and anything in
+        * the command queue is for a new connection,  do nothing
+        */
+       if (TARGET_STOP(vscsi)) {
+               vio_enable_interrupts(vscsi->dma_dev);
+
+               pr_debug("handle_crq, don't process: flags 0x%x, state 0x%hx\n",
+                        vscsi->flags, vscsi->state);
+               spin_unlock_bh(&vscsi->intr_lock);
+               return;
+       }
+
+       /*
+        * rc starts out non-zero when a disconnect is already scheduled, so
+        * the loop below takes its error branch from the first element on.
+        */
+       rc = vscsi->flags & SCHEDULE_DISCONNECT;
+       crq = vscsi->cmd_q.base_addr + vscsi->cmd_q.index;
+       valid = crq->valid;
+       /* Read the valid byte before reading the rest of the element */
+       dma_rmb();
+
+       while (valid) {
+               /*
+                * These are edge triggered interrupts. After dropping out of
+                * the while loop, the code must check for work since an
+                * interrupt could be lost, and an element be left on the
+                * queue, hence the label.
+                */
+cmd_work:
+               /* Advance the index, wrapping at the end of the queue */
+               vscsi->cmd_q.index =
+                       (vscsi->cmd_q.index + 1) & vscsi->cmd_q.mask;
+
+               if (!rc) {
+                       rc = ibmvscsis_parse_command(vscsi, crq);
+               } else {
+                       if ((uint)crq->valid == VALID_TRANS_EVENT) {
+                               /*
+                                * must service the transport layer events even
+                                * in an error state, don't break out until all
+                                * the consecutive transport events have been
+                                * processed
+                                */
+                               rc = ibmvscsis_trans_event(vscsi, crq);
+                       } else if (vscsi->flags & TRANS_EVENT) {
+                               /*
+                                * if a transport event has occurred leave
+                                * everything but transport events on the queue
+                                */
+                               pr_debug("handle_crq, ignoring\n");
+
+                               /*
+                                * need to decrement the queue index so we can
+                                * look at the element again
+                                */
+                               if (vscsi->cmd_q.index)
+                                       vscsi->cmd_q.index -= 1;
+                               else
+                                       /*
+                                        * index is at 0 it just wrapped.
+                                        * have it index last element in q
+                                        */
+                                       vscsi->cmd_q.index = vscsi->cmd_q.mask;
+                               break;
+                       }
+                       /*
+                        * Otherwise (error state, not a transport event, no
+                        * TRANS_EVENT flag) the element is not processed and
+                        * is simply invalidated below.
+                        */
+               }
+
+               /* Mark the element as consumed */
+               crq->valid = INVALIDATE_CMD_RESP_EL;
+
+               crq = vscsi->cmd_q.base_addr + vscsi->cmd_q.index;
+               valid = crq->valid;
+               dma_rmb();
+       }
+
+       if (!rc) {
+               /* Re-enable interrupts only once per invocation */
+               if (ack) {
+                       vio_enable_interrupts(vscsi->dma_dev);
+                       ack = false;
+                       pr_debug("handle_crq, reenabling interrupts\n");
+               }
+               /* Re-check for an element that arrived in the meantime */
+               valid = crq->valid;
+               dma_rmb();
+               if (valid)
+                       goto cmd_work;
+       } else {
+               pr_debug("handle_crq, error: flags 0x%x, state 0x%hx, crq index 0x%x\n",
+                        vscsi->flags, vscsi->state, vscsi->cmd_q.index);
+       }
+
+       pr_debug("Leaving handle_crq: schedule_q empty %d, flags 0x%x, state 0x%hx\n",
+                (int)list_empty(&vscsi->schedule_q), vscsi->flags,
+                vscsi->state);
+
+       spin_unlock_bh(&vscsi->intr_lock);
+}
+
+/**
+ * ibmvscsis_probe() - Set up one virtual SCSI target adapter
+ * @vdev:      VIO device being probed
+ * @id:        Matching device id entry (not referenced here)
+ *
+ * Allocates the adapter structure and, in order: reads the DMA window,
+ * adds the adapter to the global device list, allocates the SRP target and
+ * the command pool, sets up the response-queue timer, creates the command
+ * queue, allocates and maps the partner-info buffer, creates the work
+ * queue, requests the interrupt, enables interrupts and checks for an
+ * already-queued init message.  Every failure unwinds the steps completed
+ * so far through the labels at the end of the function.
+ *
+ * Return: 0 on success, a non-zero error code otherwise.
+ */
+static int ibmvscsis_probe(struct vio_dev *vdev,
+                          const struct vio_device_id *id)
+{
+       struct scsi_info *vscsi;
+       int rc = 0;
+       long hrc = 0;
+       char wq_name[24];
+
+       vscsi = kzalloc(sizeof(*vscsi), GFP_KERNEL);
+       if (!vscsi) {
+               rc = -ENOMEM;
+               pr_err("probe: allocation of adapter failed\n");
+               return rc;
+       }
+
+       vscsi->dma_dev = vdev;
+       vscsi->dev = vdev->dev;
+       INIT_LIST_HEAD(&vscsi->schedule_q);
+       INIT_LIST_HEAD(&vscsi->waiting_rsp);
+       INIT_LIST_HEAD(&vscsi->active_q);
+
+       snprintf(vscsi->tport.tport_name, 256, "%s", dev_name(&vdev->dev));
+
+       pr_debug("probe tport_name: %s\n", vscsi->tport.tport_name);
+
+       rc = read_dma_window(vscsi);
+       if (rc)
+               goto free_adapter;
+       pr_debug("Probe: liobn 0x%x, riobn 0x%x\n",
+                vscsi->dds.window[LOCAL].liobn,
+                vscsi->dds.window[REMOTE].liobn);
+
+       /*
+        * Build the eye catcher with a bounded format.  strncat()'s length
+        * argument limits the number of characters appended, not the size
+        * of the destination, so the former strcpy()/strncat() pair could
+        * write past the end of the buffer.
+        */
+       snprintf(vscsi->eye, sizeof(vscsi->eye), "VSCSI %s", vdev->name);
+
+       vscsi->dds.unit_id = vdev->unit_address;
+
+       spin_lock_bh(&ibmvscsis_dev_lock);
+       list_add_tail(&vscsi->list, &ibmvscsis_dev_list);
+       spin_unlock_bh(&ibmvscsis_dev_lock);
+
+       /*
+        * TBD: How do we determine # of cmds to request?  Do we know how
+        * many "children" we have?
+        */
+       vscsi->request_limit = INITIAL_SRP_LIMIT;
+       rc = srp_target_alloc(&vscsi->target, &vdev->dev, vscsi->request_limit,
+                             SRP_MAX_IU_LEN);
+       if (rc)
+               goto rem_list;
+
+       vscsi->target.ldata = vscsi;
+
+       rc = ibmvscsis_alloc_cmds(vscsi, vscsi->request_limit);
+       if (rc) {
+               dev_err(&vscsi->dev, "alloc_cmds failed, rc %d, num %d\n",
+                       rc, vscsi->request_limit);
+               goto free_target;
+       }
+
+       /*
+        * Note: the lock is used in freeing timers, so must initialize
+        * first so that ordering in case of error is correct.
+        */
+       spin_lock_init(&vscsi->intr_lock);
+
+       rc = ibmvscsis_alloctimer(vscsi);
+       if (rc) {
+               dev_err(&vscsi->dev, "probe: alloctimer failed, rc %d\n", rc);
+               goto free_cmds;
+       }
+
+       rc = ibmvscsis_create_command_q(vscsi, 256);
+       if (rc) {
+               dev_err(&vscsi->dev, "probe: create_command_q failed, rc %d\n",
+                       rc);
+               goto free_timer;
+       }
+
+       vscsi->map_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!vscsi->map_buf) {
+               rc = -ENOMEM;
+               dev_err(&vscsi->dev, "probe: allocating cmd buffer failed\n");
+               goto destroy_queue;
+       }
+
+       vscsi->map_ioba = dma_map_single(&vdev->dev, vscsi->map_buf, PAGE_SIZE,
+                                        DMA_BIDIRECTIONAL);
+       if (dma_mapping_error(&vdev->dev, vscsi->map_ioba)) {
+               /*
+                * rc is still 0 from the previous step; without an error
+                * code here probe would report success after freeing the
+                * adapter.
+                */
+               rc = -ENOMEM;
+               dev_err(&vscsi->dev, "probe: error mapping command buffer\n");
+               goto free_buf;
+       }
+
+       hrc = h_vioctl(vscsi->dds.unit_id, H_GET_PARTNER_INFO,
+                      (u64)vscsi->map_ioba | ((u64)PAGE_SIZE << 32), 0, 0, 0,
+                      0);
+       if (hrc == H_SUCCESS)
+               vscsi->client_data.partition_number =
+                       be64_to_cpu(*(u64 *)vscsi->map_buf);
+       /*
+        * We expect the VIOCTL to fail if we're configured as "any
+        * client can connect" and the client isn't activated yet.
+        * We'll make the call again when he sends an init msg.
+        */
+       pr_debug("probe hrc %ld, client partition num %d\n",
+                hrc, vscsi->client_data.partition_number);
+
+       tasklet_init(&vscsi->work_task, ibmvscsis_handle_crq,
+                    (unsigned long)vscsi);
+
+       init_completion(&vscsi->wait_idle);
+
+       snprintf(wq_name, sizeof(wq_name), "ibmvscsis%s",
+                dev_name(&vdev->dev));
+       vscsi->work_q = create_workqueue(wq_name);
+       if (!vscsi->work_q) {
+               rc = -ENOMEM;
+               dev_err(&vscsi->dev, "create_workqueue failed\n");
+               goto unmap_buf;
+       }
+
+       rc = request_irq(vdev->irq, ibmvscsis_interrupt, 0, "ibmvscsis", vscsi);
+       if (rc) {
+               rc = -EPERM;
+               dev_err(&vscsi->dev, "probe: request_irq failed, rc %d\n", rc);
+               goto destroy_WQ;
+       }
+
+       spin_lock_bh(&vscsi->intr_lock);
+       /* Capture the result so that the failure check below is live */
+       rc = vio_enable_interrupts(vdev);
+       if (rc) {
+               dev_err(&vscsi->dev, "enabling interrupts failed, rc %d\n", rc);
+               rc = -ENODEV;
+               spin_unlock_bh(&vscsi->intr_lock);
+               goto free_irq;
+       }
+
+       if (ibmvscsis_check_q(vscsi)) {
+               rc = ERROR;
+               dev_err(&vscsi->dev, "probe: check_q failed, rc %d\n", rc);
+               spin_unlock_bh(&vscsi->intr_lock);
+               goto disable_interrupt;
+       }
+       spin_unlock_bh(&vscsi->intr_lock);
+
+       dev_set_drvdata(&vdev->dev, vscsi);
+
+       return 0;
+
+       /* Error unwinding: each label undoes one setup step, newest first */
+disable_interrupt:
+       vio_disable_interrupts(vdev);
+free_irq:
+       free_irq(vdev->irq, vscsi);
+destroy_WQ:
+       destroy_workqueue(vscsi->work_q);
+unmap_buf:
+       dma_unmap_single(&vdev->dev, vscsi->map_ioba, PAGE_SIZE,
+                        DMA_BIDIRECTIONAL);
+free_buf:
+       kfree(vscsi->map_buf);
+destroy_queue:
+       tasklet_kill(&vscsi->work_task);
+       ibmvscsis_unregister_command_q(vscsi);
+       ibmvscsis_destroy_command_q(vscsi);
+free_timer:
+       ibmvscsis_freetimer(vscsi);
+free_cmds:
+       ibmvscsis_free_cmds(vscsi);
+free_target:
+       srp_target_free(&vscsi->target);
+rem_list:
+       spin_lock_bh(&ibmvscsis_dev_lock);
+       list_del(&vscsi->list);
+       spin_unlock_bh(&ibmvscsis_dev_lock);
+free_adapter:
+       kfree(vscsi);
+
+       return rc;
+}
+
+/*
+ * ibmvscsis_remove() - vio driver remove callback
+ * @vdev:      vio device whose driver data is the adapter's struct scsi_info
+ *
+ * Releases everything the adapter holds and then frees the adapter itself.
+ * The calls are made in the same order as the error-unwind labels at the end
+ * of the probe routine above.
+ *
+ * Return: always 0.
+ */
+static int ibmvscsis_remove(struct vio_dev *vdev)
+{
+       struct scsi_info *vscsi = dev_get_drvdata(&vdev->dev);
+
+       pr_debug("remove (%s)\n", dev_name(&vscsi->dma_dev->dev));
+
+       /*
+        * TBD: Need to handle if there are commands on the waiting_rsp q
+        *      Actually, can there still be cmds outstanding to tcm?
+        */
+
+       vio_disable_interrupts(vdev);
+       free_irq(vdev->irq, vscsi);
+       destroy_workqueue(vscsi->work_q);
+       dma_unmap_single(&vdev->dev, vscsi->map_ioba, PAGE_SIZE,
+                        DMA_BIDIRECTIONAL);
+       kfree(vscsi->map_buf);
+       tasklet_kill(&vscsi->work_task);
+       ibmvscsis_unregister_command_q(vscsi);
+       ibmvscsis_destroy_command_q(vscsi);
+       ibmvscsis_freetimer(vscsi);
+       ibmvscsis_free_cmds(vscsi);
+       srp_target_free(&vscsi->target);
+       /* Unlink from the driver's adapter list under ibmvscsis_dev_lock */
+       spin_lock_bh(&ibmvscsis_dev_lock);
+       list_del(&vscsi->list);
+       spin_unlock_bh(&ibmvscsis_dev_lock);
+       kfree(vscsi);
+
+       return 0;
+}
+
+/*
+ * sysfs show handler behind dev_attr_system_id: writes the file-scope
+ * system_id string followed by a newline into @buf.
+ */
+static ssize_t system_id_show(struct device *dev,
+                             struct device_attribute *attr, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%s\n", system_id);
+}
+
+/*
+ * sysfs show handler behind dev_attr_partition_number: writes the file-scope
+ * partition_number in hexadecimal followed by a newline into @buf.
+ */
+static ssize_t partition_number_show(struct device *dev,
+                                    struct device_attribute *attr, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%x\n", partition_number);
+}
+
+/*
+ * sysfs show handler behind dev_attr_unit_address: @dev is the struct device
+ * embedded in a struct scsi_info; writes that adapter's
+ * dma_dev->unit_address in hexadecimal followed by a newline into @buf.
+ */
+static ssize_t unit_address_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       struct scsi_info *vscsi = container_of(dev, struct scsi_info, dev);
+
+       return snprintf(buf, PAGE_SIZE, "%x\n", vscsi->dma_dev->unit_address);
+}
+
+/*
+ * ibmvscsis_get_system_info() - Cache identity values from the device tree
+ *
+ * Reads "model", "system-id", "ibm,partition-name" and "ibm,partition-no"
+ * from the root node and "ibm,max-virtual-dma-size" from /vdevice, storing
+ * the results in the file-scope system_id, partition_name, partition_number
+ * and max_vdma_size variables.  A property that is absent leaves the
+ * corresponding variable untouched; a missing /vdevice node is not an error.
+ *
+ * Return: 0 on success, -ENOENT if the root node cannot be found.
+ */
+static int ibmvscsis_get_system_info(void)
+{
+       struct device_node *rootdn, *vdevdn;
+       const char *id, *model, *name;
+       const uint *num;
+
+       rootdn = of_find_node_by_path("/");
+       if (!rootdn)
+               return -ENOENT;
+
+       model = of_get_property(rootdn, "model", NULL);
+       id = of_get_property(rootdn, "system-id", NULL);
+       if (model && id)
+               snprintf(system_id, sizeof(system_id), "%s-%s", model, id);
+
+       /*
+        * snprintf() always NUL-terminates; strncpy() would leave
+        * partition_name unterminated when the property fills the buffer.
+        */
+       name = of_get_property(rootdn, "ibm,partition-name", NULL);
+       if (name)
+               snprintf(partition_name, sizeof(partition_name), "%s", name);
+
+       /*
+        * NOTE(review): the property cells are dereferenced as native-endian
+        * integers here and below - confirm this is right on little-endian.
+        */
+       num = of_get_property(rootdn, "ibm,partition-no", NULL);
+       if (num)
+               partition_number = *num;
+
+       of_node_put(rootdn);
+
+       vdevdn = of_find_node_by_path("/vdevice");
+       if (vdevdn) {
+               const uint *mvds;
+
+               mvds = of_get_property(vdevdn, "ibm,max-virtual-dma-size",
+                                      NULL);
+               if (mvds)
+                       max_vdma_size = *mvds;
+               of_node_put(vdevdn);
+       }
+
+       return 0;
+}
+
+/* .get_fabric_name callback in ibmvscsis_ops: returns the constant name. */
+static char *ibmvscsis_get_fabric_name(void)
+{
+       return "ibmvscsis";
+}
+
+/*
+ * .tpg_get_wwn callback in ibmvscsis_ops: @se_tpg is embedded in a
+ * struct ibmvscsis_tport; return that tport's name buffer.
+ */
+static char *ibmvscsis_get_fabric_wwn(struct se_portal_group *se_tpg)
+{
+       struct ibmvscsis_tport *port;
+
+       port = container_of(se_tpg, struct ibmvscsis_tport, se_tpg);
+       return port->tport_name;
+}
+
+/*
+ * .tpg_get_tag callback in ibmvscsis_ops: @se_tpg is embedded in a
+ * struct ibmvscsis_tport; return that tport's portal group tag.
+ */
+static u16 ibmvscsis_get_tag(struct se_portal_group *se_tpg)
+{
+       struct ibmvscsis_tport *port;
+
+       port = container_of(se_tpg, struct ibmvscsis_tport, se_tpg);
+       return port->tport_tpgt;
+}
+
+/* .tpg_get_default_depth callback in ibmvscsis_ops: always reports 1. */
+static u32 ibmvscsis_get_default_depth(struct se_portal_group *se_tpg)
+{
+       return 1;
+}
+
+/*
+ * Always-true predicate; ibmvscsis_ops uses it for tpg_check_demo_mode and
+ * tpg_check_demo_mode_cache.
+ */
+static int ibmvscsis_check_true(struct se_portal_group *se_tpg)
+{
+       return 1;
+}
+
+/*
+ * Always-false predicate; ibmvscsis_ops uses it for both
+ * tpg_check_*_mode_write_protect callbacks.
+ */
+static int ibmvscsis_check_false(struct se_portal_group *se_tpg)
+{
+       return 0;
+}
+
+/* .tpg_get_inst_index callback in ibmvscsis_ops: always reports 1. */
+static u32 ibmvscsis_tpg_get_inst_index(struct se_portal_group *se_tpg)
+{
+       return 1;
+}
+
+/*
+ * .check_stop_free callback in ibmvscsis_ops: forwards to
+ * target_put_sess_cmd() and returns its result.
+ */
+static int ibmvscsis_check_stop_free(struct se_cmd *se_cmd)
+{
+       return target_put_sess_cmd(se_cmd);
+}
+
+/*
+ * ibmvscsis_release_cmd() - .release_cmd callback in ibmvscsis_ops
+ * @se_cmd:    TCM command embedded in a struct ibmvscsis_cmd
+ *
+ * With the adapter's intr_lock held, unlinks the command from the list it is
+ * on, appends it to the adapter's waiting_rsp list and calls
+ * ibmvscsis_send_messages().
+ */
+static void ibmvscsis_release_cmd(struct se_cmd *se_cmd)
+{
+       struct ibmvscsis_cmd *cmd = container_of(se_cmd, struct ibmvscsis_cmd,
+                                                se_cmd);
+       struct scsi_info *vscsi = cmd->adapter;
+
+       pr_debug("release_cmd %p, flags %d\n", se_cmd, cmd->flags);
+
+       spin_lock_bh(&vscsi->intr_lock);
+       /* Remove from active_q */
+       list_del(&cmd->list);
+       list_add_tail(&cmd->list, &vscsi->waiting_rsp);
+       ibmvscsis_send_messages(vscsi);
+       spin_unlock_bh(&vscsi->intr_lock);
+}
+
+/* .sess_get_index callback in ibmvscsis_ops: always reports 0. */
+static u32 ibmvscsis_sess_get_index(struct se_session *se_sess)
+{
+       return 0;
+}
+
+/*
+ * ibmvscsis_write_pending() - .write_pending callback in ibmvscsis_ops
+ * @se_cmd:    TCM command embedded in a struct ibmvscsis_cmd
+ *
+ * Calls srp_transfer_data() with ibmvscsis_rdma as the transfer routine and,
+ * if that succeeds, passes the command to target_execute_cmd().
+ *
+ * Return: 0 on success, -EAGAIN if srp_transfer_data() reports an error.
+ */
+static int ibmvscsis_write_pending(struct se_cmd *se_cmd)
+{
+       struct ibmvscsis_cmd *vcmd;
+       int ret;
+
+       vcmd = container_of(se_cmd, struct ibmvscsis_cmd, se_cmd);
+
+       pr_debug("write_pending, se_cmd %p, length 0x%x\n",
+                se_cmd, se_cmd->data_length);
+
+       ret = srp_transfer_data(vcmd, &vio_iu(vcmd->iue)->srp.cmd,
+                               ibmvscsis_rdma, 1, 1);
+       if (ret) {
+               pr_err("srp_transfer_data() failed: %d\n", ret);
+               return -EAGAIN;
+       }
+
+       /* Hand the WRITE CDB to TCM so it enters the execution queue. */
+       target_execute_cmd(se_cmd);
+       return 0;
+}
+
+/* .write_pending_status callback in ibmvscsis_ops: always reports 0. */
+static int ibmvscsis_write_pending_status(struct se_cmd *se_cmd)
+{
+       return 0;
+}
+
+/* .set_default_node_attributes callback in ibmvscsis_ops: does nothing. */
+static void ibmvscsis_set_default_node_attrs(struct se_node_acl *nacl)
+{
+}
+
+/* .get_cmd_state callback in ibmvscsis_ops: always reports 0. */
+static int ibmvscsis_get_cmd_state(struct se_cmd *se_cmd)
+{
+       return 0;
+}
+
+/*
+ * ibmvscsis_queue_data_in() - .queue_data_in callback in ibmvscsis_ops
+ * @se_cmd:    TCM command embedded in a struct ibmvscsis_cmd
+ *
+ * Calls srp_transfer_data() with ibmvscsis_rdma as the transfer routine.  If
+ * that fails, the command's sense buffer is rebuilt as MEDIUM ERROR with
+ * asc/ascq 0x08/0x01.  In either case the SRP response is then built and its
+ * format and length are stored in cmd->rsp.
+ *
+ * Return: always 0.
+ */
+static int ibmvscsis_queue_data_in(struct se_cmd *se_cmd)
+{
+       struct ibmvscsis_cmd *cmd = container_of(se_cmd, struct ibmvscsis_cmd,
+                                                se_cmd);
+       struct iu_entry *iue = cmd->iue;
+       struct scsi_info *vscsi = cmd->adapter;
+       uint len = 0;
+       int rc;
+
+       pr_debug("queue_data_in, se_cmd %p, length 0x%x\n",
+                se_cmd, se_cmd->data_length);
+
+       rc = srp_transfer_data(cmd, &vio_iu(iue)->srp.cmd, ibmvscsis_rdma, 1,
+                              1);
+       if (rc) {
+               pr_err("srp_transfer_data failed: %d\n", rc);
+               se_cmd->scsi_sense_length = 18;
+               memset(se_cmd->sense_buffer, 0, se_cmd->scsi_sense_length);
+               /* Logical Unit Communication Time-out asc/ascq = 0x0801 */
+               scsi_build_sense_buffer(0, se_cmd->sense_buffer, MEDIUM_ERROR,
+                                       0x08, 0x01);
+       }
+
+       srp_build_response(vscsi, cmd, &len);
+       cmd->rsp.format = SRP_FORMAT;
+       cmd->rsp.len = len;
+
+       return 0;
+}
+
+/*
+ * ibmvscsis_queue_status() - .queue_status callback in ibmvscsis_ops
+ * @se_cmd:    TCM command embedded in a struct ibmvscsis_cmd
+ *
+ * Builds the SRP response for the command and stores its format
+ * (SRP_FORMAT) and length in cmd->rsp.
+ *
+ * Return: always 0.
+ */
+static int ibmvscsis_queue_status(struct se_cmd *se_cmd)
+{
+       struct ibmvscsis_cmd *cmd = container_of(se_cmd, struct ibmvscsis_cmd,
+                                                se_cmd);
+       struct scsi_info *vscsi = cmd->adapter;
+       uint len;
+
+       pr_debug("queue_status %p\n", se_cmd);
+
+       srp_build_response(vscsi, cmd, &len);
+       cmd->rsp.format = SRP_FORMAT;
+       cmd->rsp.len = len;
+
+       return 0;
+}
+
+/*
+ * ibmvscsis_queue_tm_rsp() - .queue_tm_rsp callback in ibmvscsis_ops
+ * @se_cmd:    TCM command embedded in a struct ibmvscsis_cmd
+ *
+ * Builds the SRP response for the task management request and stores its
+ * format (SRP_FORMAT) and length in cmd->rsp.
+ */
+static void ibmvscsis_queue_tm_rsp(struct se_cmd *se_cmd)
+{
+       struct ibmvscsis_cmd *cmd = container_of(se_cmd, struct ibmvscsis_cmd,
+                                                se_cmd);
+       struct scsi_info *vscsi = cmd->adapter;
+       uint len;
+
+       pr_debug("queue_tm_rsp %p, status %d\n",
+                se_cmd, (int)se_cmd->se_tmr_req->response);
+
+       srp_build_response(vscsi, cmd, &len);
+       cmd->rsp.format = SRP_FORMAT;
+       cmd->rsp.len = len;
+}
+
+/* .aborted_task callback in ibmvscsis_ops: currently only logs the command. */
+static void ibmvscsis_aborted_task(struct se_cmd *se_cmd)
+{
+       /* TBD: What (if anything) should we do here? */
+       pr_debug("ibmvscsis_aborted_task %p\n", se_cmd);
+}
+
+/*
+ * ibmvscsis_make_tport() - .fabric_make_wwn callback in ibmvscsis_ops
+ * @tf:        unused
+ * @group:     unused
+ * @name:      name passed to ibmvscsis_lookup_port()
+ *
+ * Looks up an existing tport by @name, marks it as an SRP port and returns
+ * its embedded se_wwn.
+ *
+ * Return: the tport's se_wwn, or ERR_PTR(-EINVAL) if the lookup finds nothing.
+ */
+static struct se_wwn *ibmvscsis_make_tport(struct target_fabric_configfs *tf,
+                                          struct config_group *group,
+                                          const char *name)
+{
+       struct ibmvscsis_tport *port = ibmvscsis_lookup_port(name);
+
+       if (!port)
+               return ERR_PTR(-EINVAL);
+
+       port->tport_proto_id = SCSI_PROTOCOL_SRP;
+       pr_debug("make_tport(%s), pointer:%p, tport_id:%x\n",
+                name, port, port->tport_proto_id);
+
+       return &port->tport_wwn;
+}
+
+/*
+ * .fabric_drop_wwn callback in ibmvscsis_ops.  Nothing is released here; the
+ * function only logs the configfs item name of the tport that owns @wwn.
+ */
+static void ibmvscsis_drop_tport(struct se_wwn *wwn)
+{
+       struct ibmvscsis_tport *tport = container_of(wwn,
+                                                    struct ibmvscsis_tport,
+                                                    tport_wwn);
+
+       pr_debug("drop_tport(%s)\n",
+                config_item_name(&tport->tport_wwn.wwn_group.cg_item));
+}
+
+/*
+ * ibmvscsis_make_tpg() - .fabric_make_tpg callback in ibmvscsis_ops
+ * @wwn:       se_wwn embedded in a struct ibmvscsis_tport
+ * @group:     unused
+ * @name:      unused
+ *
+ * Clears the tport's releasing flag and registers its embedded se_tpg with
+ * core_tpg_register() using the tport's protocol id.
+ *
+ * Return: the tport's se_tpg, or ERR_PTR() of the registration error.
+ */
+static struct se_portal_group *ibmvscsis_make_tpg(struct se_wwn *wwn,
+                                                 struct config_group *group,
+                                                 const char *name)
+{
+       struct ibmvscsis_tport *tport =
+               container_of(wwn, struct ibmvscsis_tport, tport_wwn);
+       int rc;
+
+       tport->releasing = false;
+
+       rc = core_tpg_register(&tport->tport_wwn, &tport->se_tpg,
+                              tport->tport_proto_id);
+       if (rc)
+               return ERR_PTR(rc);
+
+       return &tport->se_tpg;
+}
+
+/*
+ * ibmvscsis_drop_tpg() - .fabric_drop_tpg callback in ibmvscsis_ops
+ * @se_tpg:    portal group embedded in a struct ibmvscsis_tport
+ *
+ * Marks the tport as releasing and disabled, drops its nexus and then
+ * deregisters the portal group from TCM.
+ */
+static void ibmvscsis_drop_tpg(struct se_portal_group *se_tpg)
+{
+       struct ibmvscsis_tport *tport = container_of(se_tpg,
+                                                    struct ibmvscsis_tport,
+                                                    se_tpg);
+
+       tport->releasing = true;
+       tport->enabled = false;
+
+       /*
+        * Release the virtual I_T Nexus for this ibmvscsis TPG
+        */
+       ibmvscsis_drop_nexus(tport);
+       /*
+        * Deregister the se_tpg from TCM..
+        */
+       core_tpg_deregister(se_tpg);
+}
+
+/*
+ * configfs show handler for the read-only "version" attribute: writes
+ * IBMVSCSIS_VERSION followed by a newline into @page.
+ */
+static ssize_t ibmvscsis_wwn_version_show(struct config_item *item,
+                                         char *page)
+{
+       return scnprintf(page, PAGE_SIZE, "%s\n", IBMVSCSIS_VERSION);
+}
+CONFIGFS_ATTR_RO(ibmvscsis_wwn_, version);
+
+/* NULL-terminated attribute list referenced by ibmvscsis_ops.tfc_wwn_attrs */
+static struct configfs_attribute *ibmvscsis_wwn_attrs[] = {
+       &ibmvscsis_wwn_attr_version,
+       NULL,
+};
+
+/*
+ * configfs show handler for the "enable" attribute: writes "1" or "0"
+ * (plus a newline) into @page according to the tport's enabled flag.
+ */
+static ssize_t ibmvscsis_tpg_enable_show(struct config_item *item,
+                                        char *page)
+{
+       struct se_portal_group *tpg = to_tpg(item);
+       struct ibmvscsis_tport *port;
+       int on;
+
+       port = container_of(tpg, struct ibmvscsis_tport, se_tpg);
+       on = port->enabled ? 1 : 0;
+
+       return snprintf(page, PAGE_SIZE, "%d\n", on);
+}
+
+/*
+ * ibmvscsis_tpg_enable_store() - configfs store handler for "enable"
+ * @item:      config item of the portal group
+ * @page:      user input; must parse (base auto-detected) to 0 or 1
+ * @count:     number of bytes in @page
+ *
+ * Writing 1 sets the tport's enabled flag and calls
+ * ibmvscsis_enable_change_state() with the adapter's intr_lock held; a
+ * failure there is logged but does not fail the write.  Writing 0 only
+ * clears the enabled flag.
+ *
+ * Return: @count on success, -EINVAL if the input is not 0 or 1.
+ */
+static ssize_t ibmvscsis_tpg_enable_store(struct config_item *item,
+                                         const char *page, size_t count)
+{
+       struct se_portal_group *se_tpg = to_tpg(item);
+       struct ibmvscsis_tport *tport = container_of(se_tpg,
+                                                    struct ibmvscsis_tport,
+                                                    se_tpg);
+       struct scsi_info *vscsi = container_of(tport, struct scsi_info, tport);
+       unsigned long tmp;
+       int rc;
+       long lrc;
+
+       rc = kstrtoul(page, 0, &tmp);
+       if (rc < 0) {
+               pr_err("Unable to parse value for ibmvscsis_tpg_enable_store\n");
+               return -EINVAL;
+       }
+
+       if ((tmp != 0) && (tmp != 1)) {
+               pr_err("Illegal value for ibmvscsis_tpg_enable_store\n");
+               return -EINVAL;
+       }
+
+       if (tmp) {
+               tport->enabled = true;
+               spin_lock_bh(&vscsi->intr_lock);
+               lrc = ibmvscsis_enable_change_state(vscsi);
+               if (lrc)
+                       pr_err("enable_change_state failed, rc %ld state %d\n",
+                              lrc, vscsi->state);
+               spin_unlock_bh(&vscsi->intr_lock);
+       } else {
+               tport->enabled = false;
+       }
+
+       pr_debug("tpg_enable_store, state %d\n", vscsi->state);
+
+       return count;
+}
+CONFIGFS_ATTR(ibmvscsis_tpg_, enable);
+
+/* NULL-terminated list referenced by ibmvscsis_ops.tfc_tpg_base_attrs */
+static struct configfs_attribute *ibmvscsis_tpg_attrs[] = {
+       &ibmvscsis_tpg_attr_enable,
+       NULL,
+};
+
+/*
+ * Fabric callback table for this driver.  It is handed to
+ * target_register_template() in ibmvscsis_init() and to
+ * target_unregister_template() in ibmvscsis_exit().
+ */
+static const struct target_core_fabric_ops ibmvscsis_ops = {
+       .module                         = THIS_MODULE,
+       .name                           = "ibmvscsis",
+       .get_fabric_name                = ibmvscsis_get_fabric_name,
+       .tpg_get_wwn                    = ibmvscsis_get_fabric_wwn,
+       .tpg_get_tag                    = ibmvscsis_get_tag,
+       .tpg_get_default_depth          = ibmvscsis_get_default_depth,
+       .tpg_check_demo_mode            = ibmvscsis_check_true,
+       .tpg_check_demo_mode_cache      = ibmvscsis_check_true,
+       .tpg_check_demo_mode_write_protect = ibmvscsis_check_false,
+       .tpg_check_prod_mode_write_protect = ibmvscsis_check_false,
+       .tpg_get_inst_index             = ibmvscsis_tpg_get_inst_index,
+       .check_stop_free                = ibmvscsis_check_stop_free,
+       .release_cmd                    = ibmvscsis_release_cmd,
+       .sess_get_index                 = ibmvscsis_sess_get_index,
+       .write_pending                  = ibmvscsis_write_pending,
+       .write_pending_status           = ibmvscsis_write_pending_status,
+       .set_default_node_attributes    = ibmvscsis_set_default_node_attrs,
+       .get_cmd_state                  = ibmvscsis_get_cmd_state,
+       .queue_data_in                  = ibmvscsis_queue_data_in,
+       .queue_status                   = ibmvscsis_queue_status,
+       .queue_tm_rsp                   = ibmvscsis_queue_tm_rsp,
+       .aborted_task                   = ibmvscsis_aborted_task,
+       /*
+        * Setup function pointers for logic in target_core_fabric_configfs.c
+        */
+       .fabric_make_wwn                = ibmvscsis_make_tport,
+       .fabric_drop_wwn                = ibmvscsis_drop_tport,
+       .fabric_make_tpg                = ibmvscsis_make_tpg,
+       .fabric_drop_tpg                = ibmvscsis_drop_tpg,
+
+       .tfc_wwn_attrs                  = ibmvscsis_wwn_attrs,
+       .tfc_tpg_base_attrs             = ibmvscsis_tpg_attrs,
+};
+
+/*
+ * Device release callback for ibmvscsis_class.  Intentionally empty: the
+ * struct device is embedded in struct scsi_info, which the probe error path
+ * and ibmvscsis_remove() free with kfree().
+ */
+static void ibmvscsis_dev_release(struct device *dev)
+{
+}
+
+/* The class itself exposes no attributes; the list is just the terminator. */
+static struct class_attribute ibmvscsis_class_attrs[] = {
+       __ATTR_NULL,
+};
+
+/* Read-only (S_IRUGO) device attribute "system_id" */
+static struct device_attribute dev_attr_system_id =
+       __ATTR(system_id, S_IRUGO, system_id_show, NULL);
+
+/* Read-only (S_IRUGO) device attribute "partition_number" */
+static struct device_attribute dev_attr_partition_number =
+       __ATTR(partition_number, S_IRUGO, partition_number_show, NULL);
+
+/* Read-only (S_IRUGO) device attribute "unit_address" */
+static struct device_attribute dev_attr_unit_address =
+       __ATTR(unit_address, S_IRUGO, unit_address_show, NULL);
+
+/*
+ * Per-device sysfs attributes, wrapped into ibmvscsis_dev_groups by
+ * ATTRIBUTE_GROUPS().  The list must end with NULL: sysfs walks it until it
+ * finds the terminator.
+ */
+static struct attribute *ibmvscsis_dev_attrs[] = {
+       &dev_attr_system_id.attr,
+       &dev_attr_partition_number.attr,
+       &dev_attr_unit_address.attr,
+       NULL,
+};
+ATTRIBUTE_GROUPS(ibmvscsis_dev);
+
+/*
+ * Device class "ibmvscsis"; registered in ibmvscsis_init() and unregistered
+ * in ibmvscsis_exit().
+ */
+static struct class ibmvscsis_class = {
+       .name           = "ibmvscsis",
+       .dev_release    = ibmvscsis_dev_release,
+       .class_attrs    = ibmvscsis_class_attrs,
+       .dev_groups     = ibmvscsis_dev_groups,
+};
+
+/* vio ids this driver binds to; the empty entry terminates the table. */
+static struct vio_device_id ibmvscsis_device_table[] = {
+       { "v-scsi-host", "IBM,v-scsi-host" },
+       { "", "" }
+};
+MODULE_DEVICE_TABLE(vio, ibmvscsis_device_table);
+
+/* vio driver registration: id table plus the probe/remove entry points. */
+static struct vio_driver ibmvscsis_driver = {
+       .name = "ibmvscsis",
+       .id_table = ibmvscsis_device_table,
+       .probe = ibmvscsis_probe,
+       .remove = ibmvscsis_remove,
+};
+
+/*
+ * ibmvscsis_init() - Kernel Module initialization
+ *
+ * Registration order: system info lookup, device class, SCSI target
+ * template, vio driver.  The vio driver goes last because its callbacks call
+ * into TCM, so the target template has to exist before
+ * vio_register_driver() runs.  A failure undoes the steps already completed,
+ * in reverse.
+ *
+ * Return: 0 on success, otherwise the error code of the step that failed.
+ */
+static int __init ibmvscsis_init(void)
+{
+       int rc;
+
+       rc = ibmvscsis_get_system_info();
+       if (rc) {
+               pr_err("rc %d from get_system_info\n", rc);
+               return rc;
+       }
+
+       rc = class_register(&ibmvscsis_class);
+       if (rc) {
+               pr_err("failed class register\n");
+               return rc;
+       }
+
+       rc = target_register_template(&ibmvscsis_ops);
+       if (rc) {
+               pr_err("rc %d from target_register_template\n", rc);
+               goto err_class;
+       }
+
+       rc = vio_register_driver(&ibmvscsis_driver);
+       if (rc) {
+               pr_err("rc %d from vio_register_driver\n", rc);
+               goto err_template;
+       }
+
+       return 0;
+
+err_template:
+       target_unregister_template(&ibmvscsis_ops);
+err_class:
+       class_unregister(&ibmvscsis_class);
+       return rc;
+}
+
+/*
+ * ibmvscsis_exit() - Kernel Module teardown
+ *
+ * Unregisters the vio driver, the target template and the device class, i.e.
+ * the reverse of the registration order in ibmvscsis_init().
+ */
+static void __exit ibmvscsis_exit(void)
+{
+       pr_info("Unregister IBM virtual SCSI host driver\n");
+       vio_unregister_driver(&ibmvscsis_driver);
+       target_unregister_template(&ibmvscsis_ops);
+       class_unregister(&ibmvscsis_class);
+}
+
+/* Module metadata and entry points */
+MODULE_DESCRIPTION("IBMVSCSIS fabric driver");
+MODULE_AUTHOR("Bryant G. Ly and Michael Cyr");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(IBMVSCSIS_VERSION);
+module_init(ibmvscsis_init);
+module_exit(ibmvscsis_exit);
diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.h b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.h

new file mode 100644 (file)

index 0000000..981a0c9
--- /dev/null
+++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.h
@@ -0,0 +1,346 @@
+/*******************************************************************************
+ * IBM Virtual SCSI Target Driver
+ * Copyright (C) 2003-2005 Dave Boutcher (boutcher@us.ibm.com) IBM Corp.
+ *                        Santiago Leon (santil@us.ibm.com) IBM Corp.
+ *                        Linda Xie (lxie@us.ibm.com) IBM Corp.
+ *
+ * Copyright (C) 2005-2011 FUJITA Tomonori <tomof@acm.org>
+ * Copyright (C) 2010 Nicholas A. Bellinger <nab@kernel.org>
+ * Copyright (C) 2016 Bryant G. Ly <bryantly@linux.vnet.ibm.com> IBM Corp.
+ *
+ * Authors: Bryant G. Ly <bryantly@linux.vnet.ibm.com>
+ * Authors: Michael Cyr <mikecyr@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ ****************************************************************************/
+
+#ifndef __H_IBMVSCSI_TGT
+#define __H_IBMVSCSI_TGT
+
+#include "libsrp.h"
+
+/*
+ * String buffer sizes.  PARTITION_NAMELEN sizes the partition_name fields of
+ * struct target_dds and struct client_info; IBMVSCSIS_NAMELEN sizes
+ * ibmvscsis_tport.tport_name.  SYS_ID_NAME_LEN is not used in this header;
+ * presumably it sizes the system id string in the .c file - confirm.
+ */
+#define SYS_ID_NAME_LEN                64
+#define PARTITION_NAMELEN      96
+#define IBMVSCSIS_NAMELEN       32
+
+#define MSG_HI  0
+#define MSG_LOW 1
+
+/* Command queue sizing: one page holds CRQ_PER_PAGE struct viosrp_crq */
+#define MAX_CMD_Q_PAGES       4
+#define CRQ_PER_PAGE          (PAGE_SIZE / sizeof(struct viosrp_crq))
+/* in terms of number of elements */
+#define DEFAULT_CMD_Q_SIZE    CRQ_PER_PAGE
+#define MAX_CMD_Q_SIZE        (DEFAULT_CMD_Q_SIZE * MAX_CMD_Q_PAGES)
+
+#define SRP_VIOLATION           0x102  /* general error code */
+
+/*
+ * SRP buffer formats defined as of 16.a supported by this driver.
+ */
+#define SUPPORTED_FORMATS  ((SRP_DATA_DESC_DIRECT << 1) | \
+                           (SRP_DATA_DESC_INDIRECT << 1))
+
+#define SCSI_LUN_ADDR_METHOD_FLAT      1
+
+/* One DMA window; struct target_dds holds NUM_DMA_WINDOWS of these. */
+struct dma_window {
+       u32 liobn;      /* Unique per vdevice */
+       u64 tce_base;   /* Physical location of the TCE table */
+       u64 tce_size;   /* Size of the TCE table in bytes */
+};
+
+/*
+ * Per-adapter device data (scsi_info.dds): unit id, DMA windows and the
+ * partition number and name.  LOCAL and REMOTE are presumably the indices
+ * into window[] - confirm against the users in the .c file.
+ */
+struct target_dds {
+       u64 unit_id;                /* 64 bit will force alignment */
+#define NUM_DMA_WINDOWS 2
+#define LOCAL  0
+#define REMOTE 1
+       struct dma_window  window[NUM_DMA_WINDOWS];
+
+       /* root node property "ibm,partition-no" */
+       uint partition_num;
+       char partition_name[PARTITION_NAMELEN];
+};
+
+#define MAX_NUM_PORTS        1
+/* 128 KiB */
+#define MAX_H_COPY_RDMA      (128 * 1024)
+
+/* Size of the scsi_info.eye buffer */
+#define MAX_EYE   64
+
+/* Return codes */
+#define ADAPT_SUCCESS            0L
+/* choose error codes that do not conflict with PHYP */
+#define ERROR                   -40L
+
+/*
+ * Two-byte format descriptor: a reserved byte followed by a "buffers" byte.
+ * Its users are not visible in this header.
+ */
+struct format_code {
+       u8 reserved;
+       u8 buffers;
+};
+
+/* Information kept about the client (stored as scsi_info.client_data) */
+struct client_info {
+#define SRP_VERSION "16.a"
+       char srp_version[8];
+       /* root node property ibm,partition-name */
+       char partition_name[PARTITION_NAMELEN];
+       /* root node property ibm,partition-no */
+       u32 partition_number;
+       /* initially 1 */
+       u32 mad_version;
+       u32 os_type;
+};
+
+/*
+ * Changing this constant changes the number of seconds to wait before
+ * considering the client will never service its queue again.
+ */
+#define SECONDS_TO_CONSIDER_FAILED 30
+/*
+ * These constants set the polling period used to determine if the client
+ * has freed at least one element in the response queue.
+ *
+ * NOTE(review): MAX_TIMER_POPS divides 1000000 by WAIT_NANO_SECONDS, which
+ * gives pops-per-second only if the period is expressed in microseconds;
+ * confirm the units against the code that arms the timer.
+ */
+#define WAIT_SECONDS 1
+#define WAIT_NANO_SECONDS 5000
+#define MAX_TIMER_POPS ((1000000 / WAIT_NANO_SECONDS) * \
+                       SECONDS_TO_CONSIDER_FAILED)
+/*
+ * general purpose timer control block
+ * which can be used for multiple functions
+ */
+struct timer_cb {
+       struct hrtimer timer;
+       /*
+        * how long has it been since the client
+        * serviced the queue. The variable is incremented
+        * in the service_wait_q routine and cleared
+        * in send messages
+        */
+       int timer_pops;
+       /* the timer is started */
+       bool started;
+};
+
+/* Bookkeeping for the adapter's command queue (scsi_info.cmd_q) */
+struct cmd_queue {
+       /* kva */
+       struct viosrp_crq *base_addr;
+       dma_addr_t crq_token;
+       /* used to maintain index */
+       uint mask;
+       /* current element */
+       uint index;
+       int size;
+};
+
+/* Bit positions and the single-bit masks derived from them */
+#define SCSOLNT_RESP_SHIFT     1
+#define UCSOLNT_RESP_SHIFT     2
+
+#define SCSOLNT         BIT(SCSOLNT_RESP_SHIFT)
+#define UCSOLNT         BIT(UCSOLNT_RESP_SHIFT)
+
+/* Kind of request carried by a command; each value is a distinct bit. */
+enum cmd_type {
+       SCSI_CDB        = 0x01,
+       TASK_MANAGEMENT = 0x02,
+       /* MAD or addressed to port 0 */
+       ADAPTER_MAD     = 0x04,
+       UNSET_TYPE      = 0x08,
+};
+
+/* Response bookkeeping kept per command (ibmvscsis_cmd.rsp) */
+struct iu_rsp {
+       u8 format;
+       u8 sol_not;
+       u16 len;
+       /* tag is just to help client identify cmd, so don't translate be/le */
+       u64 tag;
+};
+
+/*
+ * Per-command driver state.  The TCM se_cmd is embedded, so fabric callbacks
+ * recover this structure from an se_cmd pointer with container_of().
+ */
+struct ibmvscsis_cmd {
+       struct list_head list;
+       /* Used for TCM Core operations */
+       struct se_cmd se_cmd;
+       struct iu_entry *iue;
+       struct iu_rsp rsp;
+       struct work_struct work;
+       struct scsi_info *adapter;
+       /* Sense buffer that will be mapped into outgoing status */
+       unsigned char sense_buf[TRANSPORT_SENSE_BUFFER];
+       u64 init_time;
+#define CMD_FAST_FAIL  BIT(0)
+       u32 flags;
+       char type;
+};
+
+/* Nexus object referenced by ibmvscsis_tport.ibmv_nexus; holds the session */
+struct ibmvscsis_nexus {
+       struct se_session *se_sess;
+};
+
+/*
+ * Target port state; one instance is embedded in each struct scsi_info
+ * (scsi_info.tport).
+ */
+struct ibmvscsis_tport {
+       /* SCSI protocol the tport is providing */
+       u8 tport_proto_id;
+       /* ASCII formatted WWPN for SRP Target port */
+       char tport_name[IBMVSCSIS_NAMELEN];
+       /* Returned by ibmvscsis_make_tport() */
+       struct se_wwn tport_wwn;
+       /* Returned by ibmvscsis_make_tpg() */
+       struct se_portal_group se_tpg;
+       /* ibmvscsis port target portal group tag for TCM */
+       u16 tport_tpgt;
+       /* Pointer to TCM session for I_T Nexus */
+       struct ibmvscsis_nexus *ibmv_nexus;
+       bool enabled;
+       bool releasing;
+};
+
+/*
+ * Per-adapter state.  The state values and flag bits are defined inline,
+ * next to the fields that hold them.
+ */
+struct scsi_info {
+       struct list_head list;
+       char eye[MAX_EYE];
+
+       /* commands waiting for space on response queue */
+       struct list_head waiting_rsp;
+#define NO_QUEUE                    0x00
+#define WAIT_ENABLED                0X01
+       /* driver has received an initialize command */
+#define PART_UP_WAIT_ENAB           0x02
+#define WAIT_CONNECTION             0x04
+       /* have established a connection */
+#define CONNECTED                   0x08
+       /* at least one port is processing SRP IU */
+#define SRP_PROCESSING              0x10
+       /* remove request received */
+#define UNCONFIGURING               0x20
+       /* disconnect by letting adapter go idle, no error */
+#define WAIT_IDLE                   0x40
+       /* disconnecting to clear an error */
+#define ERR_DISCONNECT              0x80
+       /* disconnect to clear error state, then come back up */
+#define ERR_DISCONNECT_RECONNECT    0x100
+       /* disconnected after clearing an error */
+#define ERR_DISCONNECTED            0x200
+       /* A series of errors caused unexpected errors */
+#define UNDEFINED                   0x400
+       u16  state;
+       int fast_fail;
+       struct target_dds dds;
+       char *cmd_pool;
+       /* list of free commands */
+       struct list_head free_cmd;
+       /* command elements ready for scheduler */
+       struct list_head schedule_q;
+       /* commands sent to TCM */
+       struct list_head active_q;
+       caddr_t *map_buf;
+       /* ioba of map buffer */
+       dma_addr_t map_ioba;
+       /* allowable number of outstanding SRP requests */
+       int request_limit;
+       /* extra credit */
+       int credit;
+       /* outstanding transactions against credit limit */
+       int debit;
+
+       /* allow only one outstanding mad request */
+#define PROCESSING_MAD                0x00002
+       /* Waiting to go idle */
+#define WAIT_FOR_IDLE                0x00004
+       /* H_REG_CRQ called */
+#define CRQ_CLOSED                    0x00010
+       /* detected that client has failed */
+#define CLIENT_FAILED                 0x00040
+       /* detected that transport event occurred */
+#define TRANS_EVENT                   0x00080
+       /* don't attempt to send anything to the client */
+#define RESPONSE_Q_DOWN               0x00100
+       /* request made to schedule disconnect handler */
+#define SCHEDULE_DISCONNECT           0x00400
+       /* disconnect handler is scheduled */
+#define DISCONNECT_SCHEDULED          0x00800
+       u32 flags;
+       /* adapter lock */
+       spinlock_t intr_lock;
+       /* information needed to manage command queue */
+       struct cmd_queue cmd_q;
+       /* used in hcall to copy response back into srp buffer */
+       u64  empty_iu_id;
+       /* used in crq, to tag what iu the response is for */
+       u64  empty_iu_tag;
+       uint new_state;
+       /* control block for the response queue timer */
+       struct timer_cb rsp_q_timer;
+       /* keep last client to enable proper accounting */
+       struct client_info client_data;
+       /* what can this client do */
+       u32 client_cap;
+       /*
+        * The following two fields capture state and flag changes that
+        * can occur when the lock is given up.  In the original design,
+        * the lock was held during calls into phyp;
+        * however, phyp did not meet PAPR architecture.  This is
+        * a work around.
+        */
+       u16  phyp_acr_state;
+       u32 phyp_acr_flags;
+
+       struct workqueue_struct *work_q;
+       struct completion wait_idle;
+       struct device dev;
+       struct vio_dev *dma_dev;
+       struct srp_target target;
+       struct ibmvscsis_tport tport;
+       struct tasklet_struct work_task;
+       struct work_struct proc_work;
+};
+
+/*
+ * Provide a constant that allows software to detect the adapter is
+ * disconnecting from the client from one of several states.
+ */
+#define IS_DISCONNECTING (UNCONFIGURING | ERR_DISCONNECT_RECONNECT | \
+                         ERR_DISCONNECT)
+
+/*
+ * Provide a constant that can be used with interrupt handling that
+ * essentially lets the interrupt handler know that all requests should
+ * be thrown out.
+ */
+#define DONT_PROCESS_STATE (IS_DISCONNECTING | UNDEFINED | \
+                           ERR_DISCONNECTED  | WAIT_IDLE)
+
+/*
+ * If any of these flag bits are set then do not allow the interrupt
+ * handler to schedule the off level handler.
+ */
+#define BLOCK (DISCONNECT_SCHEDULED)
+
+/* State and transition events that stop the interrupt handler */
+#define TARGET_STOP(VSCSI) (long)(((VSCSI)->state & DONT_PROCESS_STATE) | \
+                                 ((VSCSI)->flags & BLOCK))
+
+/* flag bit that are not reset during disconnect */
+#define PRESERVE_FLAG_FIELDS 0
+
+/* View an iu_entry's receive buffer (sbuf->buf) as a union viosrp_iu */
+#define vio_iu(IUE) ((union viosrp_iu *)((IUE)->sbuf->buf))
+
+/* True when the low five bits of the CDB opcode byte equal 0x08 / 0x0A */
+#define READ_CMD(cdb)  (((cdb)[0] & 0x1F) == 8)
+#define WRITE_CMD(cdb) (((cdb)[0] & 0x1F) == 0xA)
+
+#ifndef H_GET_PARTNER_INFO
+#define H_GET_PARTNER_INFO      0x0000000000000008LL
+#endif
+
+/*
+ * Thin wrappers around plpar_hcall_norets() for the hypervisor calls used by
+ * this driver.
+ * NOTE(review): h_vioctl() accepts u3 and u4 but does not forward them to the
+ * hcall - confirm that this is intended.
+ */
+#define h_copy_rdma(l, sa, sb, da, db) \
+               plpar_hcall_norets(H_COPY_RDMA, l, sa, sb, da, db)
+#define h_vioctl(u, o, a, u1, u2, u3, u4) \
+               plpar_hcall_norets(H_VIOCTL, u, o, a, u1, u2)
+#define h_reg_crq(ua, tok, sz) \
+               plpar_hcall_norets(H_REG_CRQ, ua, tok, sz)
+#define h_free_crq(ua) \
+               plpar_hcall_norets(H_FREE_CRQ, ua)
+#define h_send_crq(ua, d1, d2) \
+               plpar_hcall_norets(H_SEND_CRQ, ua, d1, d2)
+
+#endif
diff --git a/drivers/scsi/ibmvscsi_tgt/libsrp.c b/drivers/scsi/ibmvscsi_tgt/libsrp.c

new file mode 100644 (file)

index 0000000..5a4cc28
--- /dev/null
+++ b/drivers/scsi/ibmvscsi_tgt/libsrp.c
@@ -0,0 +1,427 @@
+/*******************************************************************************
+ * SCSI RDMA Protocol lib functions
+ *
+ * Copyright (C) 2006 FUJITA Tomonori <tomof@acm.org>
+ * Copyright (C) 2016 Bryant G. Ly <bryantly@linux.vnet.ibm.com> IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ ***********************************************************************/
+
+#define pr_fmt(fmt)    "libsrp: " fmt
+
+#include <linux/printk.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/kfifo.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <scsi/srp.h>
+#include <target/target_core_base.h>
+#include "libsrp.h"
+#include "ibmvscsi_tgt.h"
+
+static int srp_iu_pool_alloc(struct srp_queue *q, size_t max,
+                            struct srp_buf **ring)
+{
+       struct iu_entry *iue;
+       int i;
+
+       q->pool = kcalloc(max, sizeof(struct iu_entry *), GFP_KERNEL);
+       if (!q->pool)
+               return -ENOMEM;
+       q->items = kcalloc(max, sizeof(struct iu_entry), GFP_KERNEL);
+       if (!q->items)
+               goto free_pool;
+
+       spin_lock_init(&q->lock);
+       kfifo_init(&q->queue, (void *)q->pool, max * sizeof(void *));
+
+       for (i = 0, iue = q->items; i < max; i++) {
+               kfifo_in(&q->queue, (void *)&iue, sizeof(void *));
+               iue->sbuf = ring[i];
+               iue++;
+       }
+       return 0;
+
+free_pool:
+       kfree(q->pool);
+       return -ENOMEM;
+}
+
+static void srp_iu_pool_free(struct srp_queue *q)
+{
+       kfree(q->items);
+       kfree(q->pool);
+}
+
+static struct srp_buf **srp_ring_alloc(struct device *dev,
+                                      size_t max, size_t size)
+{
+       struct srp_buf **ring;
+       int i;
+
+       ring = kcalloc(max, sizeof(struct srp_buf *), GFP_KERNEL);
+       if (!ring)
+               return NULL;
+
+       for (i = 0; i < max; i++) {
+               ring[i] = kzalloc(sizeof(*ring[i]), GFP_KERNEL);
+               if (!ring[i])
+                       goto out;
+               ring[i]->buf = dma_alloc_coherent(dev, size, &ring[i]->dma,
+                                                 GFP_KERNEL);
+               if (!ring[i]->buf)
+                       goto out;
+       }
+       return ring;
+
+out:
+       for (i = 0; i < max && ring[i]; i++) {
+               if (ring[i]->buf) {
+                       dma_free_coherent(dev, size, ring[i]->buf,
+                                         ring[i]->dma);
+               }
+               kfree(ring[i]);
+       }
+       kfree(ring);
+
+       return NULL;
+}
+
+static void srp_ring_free(struct device *dev, struct srp_buf **ring,
+                         size_t max, size_t size)
+{
+       int i;
+
+       for (i = 0; i < max; i++) {
+               dma_free_coherent(dev, size, ring[i]->buf, ring[i]->dma);
+               kfree(ring[i]);
+       }
+       kfree(ring);
+}
+
+int srp_target_alloc(struct srp_target *target, struct device *dev,
+                    size_t nr, size_t iu_size)
+{
+       int err;
+
+       spin_lock_init(&target->lock);
+
+       target->dev = dev;
+
+       target->srp_iu_size = iu_size;
+       target->rx_ring_size = nr;
+       target->rx_ring = srp_ring_alloc(target->dev, nr, iu_size);
+       if (!target->rx_ring)
+               return -ENOMEM;
+       err = srp_iu_pool_alloc(&target->iu_queue, nr, target->rx_ring);
+       if (err)
+               goto free_ring;
+
+       dev_set_drvdata(target->dev, target);
+       return 0;
+
+free_ring:
+       srp_ring_free(target->dev, target->rx_ring, nr, iu_size);
+       return -ENOMEM;
+}
+
+void srp_target_free(struct srp_target *target)
+{
+       dev_set_drvdata(target->dev, NULL);
+       srp_ring_free(target->dev, target->rx_ring, target->rx_ring_size,
+                     target->srp_iu_size);
+       srp_iu_pool_free(&target->iu_queue);
+}
+
+struct iu_entry *srp_iu_get(struct srp_target *target)
+{
+       struct iu_entry *iue = NULL;
+
+       if (kfifo_out_locked(&target->iu_queue.queue, (void *)&iue,
+                            sizeof(void *),
+                            &target->iu_queue.lock) != sizeof(void *)) {
+               WARN_ONCE(1, "unexpected fifo state");
+               return NULL;
+       }
+       if (!iue)
+               return iue;
+       iue->target = target;
+       iue->flags = 0;
+       return iue;
+}
+
+void srp_iu_put(struct iu_entry *iue)
+{
+       kfifo_in_locked(&iue->target->iu_queue.queue, (void *)&iue,
+                       sizeof(void *), &iue->target->iu_queue.lock);
+}
+
+/*
+ * Transfer the data described by a single direct descriptor.
+ *
+ * @cmd:      command whose se_cmd scatterlist is the local side of the copy
+ * @md:       direct descriptor; md->len is stored big-endian
+ * @dir:      direction handed through unchanged to @rdma_io
+ * @rdma_io:  callback that performs the actual copy
+ * @dma_map:  non-zero to DMA-map the scatterlist around the @rdma_io call
+ * @ext_desc: unused here; keeps the signature parallel to srp_indirect_data()
+ *
+ * Returns the value produced by @rdma_io, or -EIO when the scatterlist
+ * cannot be DMA-mapped.
+ */
+static int srp_direct_data(struct ibmvscsis_cmd *cmd, struct srp_direct_buf *md,
+                          enum dma_data_direction dir, srp_rdma_t rdma_io,
+                          int dma_map, int ext_desc)
+{
+       struct iu_entry *iue = NULL;
+       struct scatterlist *sg = NULL;
+       int err, nsg = 0, len;
+
+       if (dma_map) {
+               iue = cmd->iue;
+               sg = cmd->se_cmd.t_data_sg;
+               nsg = dma_map_sg(iue->target->dev, sg, cmd->se_cmd.t_data_nents,
+                                DMA_BIDIRECTIONAL);
+               if (!nsg) {
+                       pr_err("fail to map %p %d\n", iue,
+                              cmd->se_cmd.t_data_nents);
+                       /*
+                        * Nothing was transferred, so report a failure
+                        * instead of 0; srp_indirect_data() returns -EIO
+                        * for the same mapping error.
+                        */
+                       return -EIO;
+               }
+               /* Never move more than the command's own data length. */
+               len = min(cmd->se_cmd.data_length, be32_to_cpu(md->len));
+       } else {
+               len = be32_to_cpu(md->len);
+       }
+
+       /* A direct buffer is exactly one descriptor, hence the count of 1. */
+       err = rdma_io(cmd, sg, nsg, md, 1, dir, len);
+
+       if (dma_map)
+               dma_unmap_sg(iue->target->dev, sg, nsg, DMA_BIDIRECTIONAL);
+
+       return err;
+}
+
+static int srp_indirect_data(struct ibmvscsis_cmd *cmd, struct srp_cmd *srp_cmd,
+                            struct srp_indirect_buf *id,
+                            enum dma_data_direction dir, srp_rdma_t rdma_io,
+                            int dma_map, int ext_desc)
+{
+       struct iu_entry *iue = NULL;
+       struct srp_direct_buf *md = NULL;
+       struct scatterlist dummy, *sg = NULL;
+       dma_addr_t token = 0;
+       int err = 0;
+       int nmd, nsg = 0, len;
+
+       if (dma_map || ext_desc) {
+               iue = cmd->iue;
+               sg = cmd->se_cmd.t_data_sg;
+       }
+
+       nmd = be32_to_cpu(id->table_desc.len) / sizeof(struct srp_direct_buf);
+
+       if ((dir == DMA_FROM_DEVICE && nmd == srp_cmd->data_in_desc_cnt) ||
+           (dir == DMA_TO_DEVICE && nmd == srp_cmd->data_out_desc_cnt)) {
+               md = &id->desc_list[0];
+               goto rdma;
+       }
+
+       if (ext_desc && dma_map) {
+               md = dma_alloc_coherent(iue->target->dev,
+                                       be32_to_cpu(id->table_desc.len),
+                                       &token, GFP_KERNEL);
+               if (!md) {
+                       pr_err("Can't get dma memory %u\n",
+                              be32_to_cpu(id->table_desc.len));
+                       return -ENOMEM;
+               }
+
+               sg_init_one(&dummy, md, be32_to_cpu(id->table_desc.len));
+               sg_dma_address(&dummy) = token;
+               sg_dma_len(&dummy) = be32_to_cpu(id->table_desc.len);
+               err = rdma_io(cmd, &dummy, 1, &id->table_desc, 1, DMA_TO_DEVICE,
+                             be32_to_cpu(id->table_desc.len));
+               if (err) {
+                       pr_err("Error copying indirect table %d\n", err);
+                       goto free_mem;
+               }
+       } else {
+               pr_err("This command uses external indirect buffer\n");
+               return -EINVAL;
+       }
+
+rdma:
+       if (dma_map) {
+               nsg = dma_map_sg(iue->target->dev, sg, cmd->se_cmd.t_data_nents,
+                                DMA_BIDIRECTIONAL);
+               if (!nsg) {
+                       pr_err("fail to map %p %d\n", iue,
+                              cmd->se_cmd.t_data_nents);
+                       err = -EIO;
+                       goto free_mem;
+               }
+               len = min(cmd->se_cmd.data_length, be32_to_cpu(id->len));
+       } else {
+               len = be32_to_cpu(id->len);
+       }
+
+       err = rdma_io(cmd, sg, nsg, md, nmd, dir, len);
+
+       if (dma_map)
+               dma_unmap_sg(iue->target->dev, sg, nsg, DMA_BIDIRECTIONAL);
+
+free_mem:
+       if (token && dma_map) {
+               dma_free_coherent(iue->target->dev,
+                                 be32_to_cpu(id->table_desc.len), md, token);
+       }
+       return err;
+}
+
+static int data_out_desc_size(struct srp_cmd *cmd)
+{
+       int size = 0;
+       u8 fmt = cmd->buf_fmt >> 4;
+
+       switch (fmt) {
+       case SRP_NO_DATA_DESC:
+               break;
+       case SRP_DATA_DESC_DIRECT:
+               size = sizeof(struct srp_direct_buf);
+               break;
+       case SRP_DATA_DESC_INDIRECT:
+               size = sizeof(struct srp_indirect_buf) +
+                       sizeof(struct srp_direct_buf) * cmd->data_out_desc_cnt;
+               break;
+       default:
+               pr_err("client error. Invalid data_out_format %x\n", fmt);
+               break;
+       }
+       return size;
+}
+
+/*
+ * TODO: this can be called multiple times for a single command if it
+ * has very long data.
+ */
+int srp_transfer_data(struct ibmvscsis_cmd *cmd, struct srp_cmd *srp_cmd,
+                     srp_rdma_t rdma_io, int dma_map, int ext_desc)
+{
+       struct srp_direct_buf *md;
+       struct srp_indirect_buf *id;
+       enum dma_data_direction dir;
+       int offset, err = 0;
+       u8 format;
+
+       if (!cmd->se_cmd.t_data_nents)
+               return 0;
+
+       offset = srp_cmd->add_cdb_len & ~3;
+
+       dir = srp_cmd_direction(srp_cmd);
+       if (dir == DMA_FROM_DEVICE)
+               offset += data_out_desc_size(srp_cmd);
+
+       if (dir == DMA_TO_DEVICE)
+               format = srp_cmd->buf_fmt >> 4;
+       else
+               format = srp_cmd->buf_fmt & ((1U << 4) - 1);
+
+       switch (format) {
+       case SRP_NO_DATA_DESC:
+               break;
+       case SRP_DATA_DESC_DIRECT:
+               md = (struct srp_direct_buf *)(srp_cmd->add_data + offset);
+               err = srp_direct_data(cmd, md, dir, rdma_io, dma_map, ext_desc);
+               break;
+       case SRP_DATA_DESC_INDIRECT:
+               id = (struct srp_indirect_buf *)(srp_cmd->add_data + offset);
+               err = srp_indirect_data(cmd, srp_cmd, id, dir, rdma_io, dma_map,
+                                       ext_desc);
+               break;
+       default:
+               pr_err("Unknown format %d %x\n", dir, format);
+               err = -EINVAL;
+       }
+
+       return err;
+}
+
+u64 srp_data_length(struct srp_cmd *cmd, enum dma_data_direction dir)
+{
+       struct srp_direct_buf *md;
+       struct srp_indirect_buf *id;
+       u64 len = 0;
+       uint offset = cmd->add_cdb_len & ~3;
+       u8 fmt;
+
+       if (dir == DMA_TO_DEVICE) {
+               fmt = cmd->buf_fmt >> 4;
+       } else {
+               fmt = cmd->buf_fmt & ((1U << 4) - 1);
+               offset += data_out_desc_size(cmd);
+       }
+
+       switch (fmt) {
+       case SRP_NO_DATA_DESC:
+               break;
+       case SRP_DATA_DESC_DIRECT:
+               md = (struct srp_direct_buf *)(cmd->add_data + offset);
+               len = be32_to_cpu(md->len);
+               break;
+       case SRP_DATA_DESC_INDIRECT:
+               id = (struct srp_indirect_buf *)(cmd->add_data + offset);
+               len = be32_to_cpu(id->len);
+               break;
+       default:
+               pr_err("invalid data format %x\n", fmt);
+               break;
+       }
+       return len;
+}
+
+int srp_get_desc_table(struct srp_cmd *srp_cmd, enum dma_data_direction *dir,
+                      u64 *data_len)
+{
+       struct srp_indirect_buf *idb;
+       struct srp_direct_buf *db;
+       uint add_cdb_offset;
+       int rc;
+
+       /*
+        * The pointer computations below will only be compiled correctly
+        * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check
+        * whether srp_cmd::add_data has been declared as a byte pointer.
+        */
+       BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0)
+                    && !__same_type(srp_cmd->add_data[0], (u8)0));
+
+       BUG_ON(!dir);
+       BUG_ON(!data_len);
+
+       rc = 0;
+       *data_len = 0;
+
+       *dir = DMA_NONE;
+
+       if (srp_cmd->buf_fmt & 0xf)
+               *dir = DMA_FROM_DEVICE;
+       else if (srp_cmd->buf_fmt >> 4)
+               *dir = DMA_TO_DEVICE;
+
+       add_cdb_offset = srp_cmd->add_cdb_len & ~3;
+       if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) ||
+           ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) {
+               db = (struct srp_direct_buf *)(srp_cmd->add_data
+                                              + add_cdb_offset);
+               *data_len = be32_to_cpu(db->len);
+       } else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) ||
+                  ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) {
+               idb = (struct srp_indirect_buf *)(srp_cmd->add_data
+                                                 + add_cdb_offset);
+
+               *data_len = be32_to_cpu(idb->len);
+       }
+       return rc;
+}
+
+MODULE_DESCRIPTION("SCSI RDMA Protocol lib functions");
+MODULE_AUTHOR("FUJITA Tomonori");
+MODULE_LICENSE("GPL");
diff --git a/drivers/scsi/ibmvscsi_tgt/libsrp.h b/drivers/scsi/ibmvscsi_tgt/libsrp.h

new file mode 100644 (file)

index 0000000..4696f33
--- /dev/null
+++ b/drivers/scsi/ibmvscsi_tgt/libsrp.h
@@ -0,0 +1,123 @@
+#ifndef __LIBSRP_H__
+#define __LIBSRP_H__
+
+#include <linux/list.h>
+#include <linux/kfifo.h>
+#include <scsi/srp.h>
+
+enum srp_valid {
+       INVALIDATE_CMD_RESP_EL = 0,
+       VALID_CMD_RESP_EL = 0x80,
+       VALID_INIT_MSG = 0xC0,
+       VALID_TRANS_EVENT = 0xFF
+};
+
+enum srp_format {
+       SRP_FORMAT = 1,
+       MAD_FORMAT = 2,
+       OS400_FORMAT = 3,
+       AIX_FORMAT = 4,
+       LINUX_FORMAT = 5,
+       MESSAGE_IN_CRQ = 6
+};
+
+enum srp_init_msg {
+       INIT_MSG = 1,
+       INIT_COMPLETE_MSG = 2
+};
+
+enum srp_trans_event {
+       UNUSED_FORMAT = 0,
+       PARTNER_FAILED = 1,
+       PARTNER_DEREGISTER = 2,
+       MIGRATED = 6
+};
+
+enum srp_status {
+       HEADER_DESCRIPTOR = 0xF1,
+       PING = 0xF5,
+       PING_RESPONSE = 0xF6
+};
+
+enum srp_mad_version {
+       MAD_VERSION_1 = 1
+};
+
+enum srp_os_type {
+       OS400 = 1,
+       LINUX = 2,
+       AIX = 3,
+       OFW = 4
+};
+
+enum srp_task_attributes {
+       SRP_SIMPLE_TASK = 0,
+       SRP_HEAD_TASK = 1,
+       SRP_ORDERED_TASK = 2,
+       SRP_ACA_TASK = 4
+};
+
+enum {
+       SRP_TASK_MANAGEMENT_FUNCTION_COMPLETE           = 0,
+       SRP_REQUEST_FIELDS_INVALID                      = 2,
+       SRP_TASK_MANAGEMENT_FUNCTION_NOT_SUPPORTED      = 4,
+       SRP_TASK_MANAGEMENT_FUNCTION_FAILED             = 5
+};
+
+struct srp_buf {
+       dma_addr_t dma;
+       void *buf;
+};
+
+struct srp_queue {
+       void *pool;
+       void *items;
+       struct kfifo queue;
+       spinlock_t lock;
+};
+
+struct srp_target {
+       struct device *dev;
+
+       spinlock_t lock;
+       struct list_head cmd_queue;
+
+       size_t srp_iu_size;
+       struct srp_queue iu_queue;
+       size_t rx_ring_size;
+       struct srp_buf **rx_ring;
+
+       void *ldata;
+};
+
+struct iu_entry {
+       struct srp_target *target;
+
+       struct list_head ilist;
+       dma_addr_t remote_token;
+       unsigned long flags;
+
+       struct srp_buf *sbuf;
+       u16 iu_len;
+};
+
+struct ibmvscsis_cmd;
+
+typedef int (srp_rdma_t)(struct ibmvscsis_cmd *, struct scatterlist *, int,
+                        struct srp_direct_buf *, int,
+                        enum dma_data_direction, unsigned int);
+int srp_target_alloc(struct srp_target *, struct device *, size_t, size_t);
+void srp_target_free(struct srp_target *);
+struct iu_entry *srp_iu_get(struct srp_target *);
+void srp_iu_put(struct iu_entry *);
+int srp_transfer_data(struct ibmvscsis_cmd *, struct srp_cmd *,
+                     srp_rdma_t, int, int);
+u64 srp_data_length(struct srp_cmd *cmd, enum dma_data_direction dir);
+int srp_get_desc_table(struct srp_cmd *srp_cmd, enum dma_data_direction *dir,
+                      u64 *data_len);
+static inline int srp_cmd_direction(struct srp_cmd *cmd)
+{
+       return (cmd->buf_fmt >> 4) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+}
+
+#endif
diff --git a/drivers/staging/emxx_udc/Kconfig b/drivers/staging/emxx_udc/Kconfig

index cc34020204874acb0156a7c7c4f82fe55e4c13f8..d7577096fb25ae7a7002ce2366cd3bacabfacfce 100644 (file)
--- a/drivers/staging/emxx_udc/Kconfig
+++ b/drivers/staging/emxx_udc/Kconfig
@@ -1,5 +1,5 @@
  config USB_EMXX
-       bool "EMXX USB Function Device Controller"
+       tristate "EMXX USB Function Device Controller"
         depends on USB_GADGET && (ARCH_SHMOBILE || (ARM && COMPILE_TEST))
         help
            The Emma Mobile series of SoCs from Renesas Electronics and
diff --git a/drivers/staging/emxx_udc/emxx_udc.c b/drivers/staging/emxx_udc/emxx_udc.c

index 3bd91758b2daaef1031874d44c8071de4e40e47c..3b56b2826263951fb8fbc19fafaf082426dbc9a0 100644 (file)
--- a/drivers/staging/emxx_udc/emxx_udc.c
+++ b/drivers/staging/emxx_udc/emxx_udc.c
@@ -15,7 +15,7 @@
   */
  
  #include <linux/kernel.h>
-#include <linux/init.h>
+#include <linux/module.h>
  #include <linux/platform_device.h>
  #include <linux/delay.h>
  #include <linux/ioport.h>
@@ -39,9 +39,11 @@
  
  #include "emxx_udc.h"
  
+#define        DRIVER_DESC     "EMXX UDC driver"
  #define        DMA_ADDR_INVALID        (~(dma_addr_t)0)
  
  static const char      driver_name[] = "emxx_udc";
+static const char      driver_desc[] = DRIVER_DESC;
  
  /*===========================================================================*/
  /* Prototype */
@@ -3295,6 +3297,28 @@ static void nbu2ss_drv_shutdown(struct platform_device *pdev)
         _nbu2ss_disable_controller(udc);
  }
  
+/*-------------------------------------------------------------------------*/
+static int nbu2ss_drv_remove(struct platform_device *pdev)
+{
+       struct nbu2ss_udc       *udc;
+       struct nbu2ss_ep        *ep;
+       int     i;
+
+       udc = &udc_controller;
+
+       for (i = 0; i < NUM_ENDPOINTS; i++) {
+               ep = &udc->ep[i];
+               if (ep->virt_buf)
+                       dma_free_coherent(NULL, PAGE_SIZE,
+                               (void *)ep->virt_buf, ep->phys_buf);
+       }
+
+       /* Interrupt Handler - Release */
+       free_irq(INT_VBUS, udc);
+
+       return 0;
+}
+
  /*-------------------------------------------------------------------------*/
  static int nbu2ss_drv_suspend(struct platform_device *pdev, pm_message_t state)
  {
@@ -3347,12 +3371,16 @@ static int nbu2ss_drv_resume(struct platform_device *pdev)
  static struct platform_driver udc_driver = {
         .probe          = nbu2ss_drv_probe,
         .shutdown       = nbu2ss_drv_shutdown,
+       .remove         = nbu2ss_drv_remove,
         .suspend        = nbu2ss_drv_suspend,
         .resume         = nbu2ss_drv_resume,
         .driver         = {
-               .name                   = driver_name,
-               .suppress_bind_attrs    = true,
+               .name   = driver_name,
         },
  };
  
-builtin_platform_driver(udc_driver);
+module_platform_driver(udc_driver);
+
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_AUTHOR("Renesas Electronics Corporation");
+MODULE_LICENSE("GPL");
diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c

index 50f3d3a0dd7b93d4e8789913997ebd67eeb70425..39b928c2849d71c47390ef55002dc767f7ae4e12 100644 (file)
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -492,7 +492,8 @@ void iscsit_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
         bool scsi_cmd = (cmd->iscsi_opcode == ISCSI_OP_SCSI_CMD);
  
         spin_lock_bh(&conn->cmd_lock);
-       if (!list_empty(&cmd->i_conn_node))
+       if (!list_empty(&cmd->i_conn_node) &&
+           !(cmd->se_cmd.transport_state & CMD_T_FABRIC_STOP))
                 list_del_init(&cmd->i_conn_node);
         spin_unlock_bh(&conn->cmd_lock);
  
@@ -4034,6 +4035,7 @@ int iscsi_target_rx_thread(void *arg)
  
  static void iscsit_release_commands_from_conn(struct iscsi_conn *conn)
  {
+       LIST_HEAD(tmp_list);
         struct iscsi_cmd *cmd = NULL, *cmd_tmp = NULL;
         struct iscsi_session *sess = conn->sess;
         /*
@@ -4042,18 +4044,26 @@ static void iscsit_release_commands_from_conn(struct iscsi_conn *conn)
          * has been reset -> returned sleeping pre-handler state.
          */
         spin_lock_bh(&conn->cmd_lock);
-       list_for_each_entry_safe(cmd, cmd_tmp, &conn->conn_cmd_list, i_conn_node) {
+       list_splice_init(&conn->conn_cmd_list, &tmp_list);
  
+       list_for_each_entry(cmd, &tmp_list, i_conn_node) {
+               struct se_cmd *se_cmd = &cmd->se_cmd;
+
+               if (se_cmd->se_tfo != NULL) {
+                       spin_lock(&se_cmd->t_state_lock);
+                       se_cmd->transport_state |= CMD_T_FABRIC_STOP;
+                       spin_unlock(&se_cmd->t_state_lock);
+               }
+       }
+       spin_unlock_bh(&conn->cmd_lock);
+
+       list_for_each_entry_safe(cmd, cmd_tmp, &tmp_list, i_conn_node) {
                 list_del_init(&cmd->i_conn_node);
-               spin_unlock_bh(&conn->cmd_lock);
  
                 iscsit_increment_maxcmdsn(cmd, sess);
-
                 iscsit_free_cmd(cmd, true);
  
-               spin_lock_bh(&conn->cmd_lock);
         }
-       spin_unlock_bh(&conn->cmd_lock);
  }
  
  static void iscsit_stop_timers_for_cmds(
diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c

index b5212f0f9571b5ce85c5c2413baac8f7d900b58a..adf419fa429189ceca94d04782221fc34de23b6c 100644 (file)
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -1371,8 +1371,9 @@ static int __iscsi_target_login_thread(struct iscsi_np *np)
         }
         login->zero_tsih = zero_tsih;
  
-       conn->sess->se_sess->sup_prot_ops =
-               conn->conn_transport->iscsit_get_sup_prot_ops(conn);
+       if (conn->sess)
+               conn->sess->se_sess->sup_prot_ops =
+                       conn->conn_transport->iscsit_get_sup_prot_ops(conn);
  
         tpg = conn->tpg;
         if (!tpg) {
diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c

index a4046ca6e60da85d5e0f489ae58981fd5ba45426..6b423485c5d6b4f6e8e54a332f1dd62eda0325d9 100644 (file)
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -821,13 +821,15 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
   * in ATA and we need to set TPE=1
   */
  bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
-                                      struct request_queue *q, int block_size)
+                                      struct request_queue *q)
  {
+       int block_size = queue_logical_block_size(q);
+
         if (!blk_queue_discard(q))
                 return false;
  
-       attrib->max_unmap_lba_count = (q->limits.max_discard_sectors << 9) /
-                                                               block_size;
+       attrib->max_unmap_lba_count =
+               q->limits.max_discard_sectors >> (ilog2(block_size) - 9);
         /*
          * Currently hardcoded to 1 in Linux/SCSI code..
          */
diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c

index 75f0f08b2a34f32d0b2bfdd35fa40f027e74ae75..d545993df18be9ede3253861e24c25726d9a8e27 100644 (file)
--- a/drivers/target/target_core_file.c
+++ b/drivers/target/target_core_file.c
@@ -161,8 +161,7 @@ static int fd_configure_device(struct se_device *dev)
                         dev_size, div_u64(dev_size, fd_dev->fd_block_size),
                         fd_dev->fd_block_size);
  
-               if (target_configure_unmap_from_queue(&dev->dev_attrib, q,
-                                                     fd_dev->fd_block_size))
+               if (target_configure_unmap_from_queue(&dev->dev_attrib, q))
                         pr_debug("IFILE: BLOCK Discard support available,"
                                  " disabled by default\n");
                 /*
@@ -523,7 +522,7 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
          */
         if (cmd->data_length > FD_MAX_BYTES) {
                 pr_err("FILEIO: Not able to process I/O of %u bytes due to"
-                      "FD_MAX_BYTES: %u iovec count limitiation\n",
+                      "FD_MAX_BYTES: %u iovec count limitation\n",
                         cmd->data_length, FD_MAX_BYTES);
                 return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
         }
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c

index 22af12f8b8eb7ff13d079fb3291c680bb3d6dec0..47cf6c977367ed2b10314a0156f2b5deaaa1ca59 100644 (file)
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -121,8 +121,7 @@ static int iblock_configure_device(struct se_device *dev)
         dev->dev_attrib.hw_max_sectors = queue_max_hw_sectors(q);
         dev->dev_attrib.hw_queue_depth = q->nr_requests;
  
-       if (target_configure_unmap_from_queue(&dev->dev_attrib, q,
-                                             dev->dev_attrib.hw_block_size))
+       if (target_configure_unmap_from_queue(&dev->dev_attrib, q))
                 pr_debug("IBLOCK: BLOCK Discard support available,"
                          " disabled by default\n");
  
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h

index fc91e85f54ba683dd298afc6db1d907192cef598..e2c970a9d61c32c7a95d034f889992a7560fdb64 100644 (file)
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -146,6 +146,7 @@ sense_reason_t      target_cmd_size_check(struct se_cmd *cmd, unsigned int size);
  void   target_qf_do_work(struct work_struct *work);
  bool   target_check_wce(struct se_device *dev);
  bool   target_check_fua(struct se_device *dev);
+void   __target_execute_cmd(struct se_cmd *, bool);
  
  /* target_core_stat.c */
  void   target_stat_setup_dev_default_groups(struct se_device *);
diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c

index a9057aa07176ff240748372215333dd531296ca2..04f616b3ba0a848a80d4a70c084c1b45d406c168 100644 (file)
--- a/drivers/target/target_core_sbc.c
+++ b/drivers/target/target_core_sbc.c
@@ -602,7 +602,7 @@ static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool succes
         cmd->transport_state |= CMD_T_ACTIVE|CMD_T_BUSY|CMD_T_SENT;
         spin_unlock_irq(&cmd->t_state_lock);
  
-       __target_execute_cmd(cmd);
+       __target_execute_cmd(cmd, false);
  
         kfree(buf);
         return ret;
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c

index 5ab3967dda43ee989a6a4e5ccf1a553e67a7a0db..6094a6beddde9fb5d045644b6b11b32e6bd149c1 100644 (file)
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -754,7 +754,15 @@ EXPORT_SYMBOL(target_complete_cmd);
  
  void target_complete_cmd_with_length(struct se_cmd *cmd, u8 scsi_status, int length)
  {
-       if (scsi_status == SAM_STAT_GOOD && length < cmd->data_length) {
+       if (scsi_status != SAM_STAT_GOOD) {
+               return;
+       }
+
+       /*
+        * Calculate new residual count based upon length of SCSI data
+        * transferred.
+        */
+       if (length < cmd->data_length) {
                 if (cmd->se_cmd_flags & SCF_UNDERFLOW_BIT) {
                         cmd->residual_count += cmd->data_length - length;
                 } else {
@@ -763,6 +771,12 @@ void target_complete_cmd_with_length(struct se_cmd *cmd, u8 scsi_status, int len
                 }
  
                 cmd->data_length = length;
+       } else if (length > cmd->data_length) {
+               cmd->se_cmd_flags |= SCF_OVERFLOW_BIT;
+               cmd->residual_count = length - cmd->data_length;
+       } else {
+               cmd->se_cmd_flags &= ~(SCF_OVERFLOW_BIT | SCF_UNDERFLOW_BIT);
+               cmd->residual_count = 0;
         }
  
         target_complete_cmd(cmd, scsi_status);
@@ -1303,23 +1317,6 @@ target_setup_cmd_from_cdb(struct se_cmd *cmd, unsigned char *cdb)
  
         trace_target_sequencer_start(cmd);
  
-       /*
-        * Check for an existing UNIT ATTENTION condition
-        */
-       ret = target_scsi3_ua_check(cmd);
-       if (ret)
-               return ret;
-
-       ret = target_alua_state_check(cmd);
-       if (ret)
-               return ret;
-
-       ret = target_check_reservation(cmd);
-       if (ret) {
-               cmd->scsi_status = SAM_STAT_RESERVATION_CONFLICT;
-               return ret;
-       }
-
         ret = dev->transport->parse_cdb(cmd);
         if (ret == TCM_UNSUPPORTED_SCSI_OPCODE)
                 pr_warn_ratelimited("%s/%s: Unsupported SCSI Opcode 0x%02x, sending CHECK_CONDITION.\n",
@@ -1761,20 +1758,45 @@ queue_full:
  }
  EXPORT_SYMBOL(transport_generic_request_failure);
  
-void __target_execute_cmd(struct se_cmd *cmd)
+void __target_execute_cmd(struct se_cmd *cmd, bool do_checks)
  {
         sense_reason_t ret;
  
-       if (cmd->execute_cmd) {
-               ret = cmd->execute_cmd(cmd);
-               if (ret) {
-                       spin_lock_irq(&cmd->t_state_lock);
-                       cmd->transport_state &= ~(CMD_T_BUSY|CMD_T_SENT);
-                       spin_unlock_irq(&cmd->t_state_lock);
+       if (!cmd->execute_cmd) {
+               ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
+               goto err;
+       }
+       if (do_checks) {
+               /*
+                * Check for an existing UNIT ATTENTION condition after
+                * target_handle_task_attr() has done SAM task attr
+                * checking, and has possibly already deferred execution
+                * out to target_restart_delayed_cmds() context.
+                */
+               ret = target_scsi3_ua_check(cmd);
+               if (ret)
+                       goto err;
  
-                       transport_generic_request_failure(cmd, ret);
+               ret = target_alua_state_check(cmd);
+               if (ret)
+                       goto err;
+
+               ret = target_check_reservation(cmd);
+               if (ret) {
+                       cmd->scsi_status = SAM_STAT_RESERVATION_CONFLICT;
+                       goto err;
                 }
         }
+
+       ret = cmd->execute_cmd(cmd);
+       if (!ret)
+               return;
+err:
+       spin_lock_irq(&cmd->t_state_lock);
+       cmd->transport_state &= ~(CMD_T_BUSY|CMD_T_SENT);
+       spin_unlock_irq(&cmd->t_state_lock);
+
+       transport_generic_request_failure(cmd, ret);
  }
  
  static int target_write_prot_action(struct se_cmd *cmd)
@@ -1819,6 +1841,8 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
         if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH)
                 return false;
  
+       cmd->se_cmd_flags |= SCF_TASK_ATTR_SET;
+
         /*
          * Check for the existence of HEAD_OF_QUEUE, and if true return 1
          * to allow the passed struct se_cmd list of tasks to the front of the list.
@@ -1899,7 +1923,7 @@ void target_execute_cmd(struct se_cmd *cmd)
                 return;
         }
  
-       __target_execute_cmd(cmd);
+       __target_execute_cmd(cmd, true);
  }
  EXPORT_SYMBOL(target_execute_cmd);
  
@@ -1923,7 +1947,7 @@ static void target_restart_delayed_cmds(struct se_device *dev)
                 list_del(&cmd->se_delayed_node);
                 spin_unlock(&dev->delayed_cmd_lock);
  
-               __target_execute_cmd(cmd);
+               __target_execute_cmd(cmd, true);
  
                 if (cmd->sam_task_attr == TCM_ORDERED_TAG)
                         break;
@@ -1941,6 +1965,9 @@ static void transport_complete_task_attr(struct se_cmd *cmd)
         if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH)
                 return;
  
+       if (!(cmd->se_cmd_flags & SCF_TASK_ATTR_SET))
+               goto restart;
+
         if (cmd->sam_task_attr == TCM_SIMPLE_TAG) {
                 atomic_dec_mb(&dev->simple_cmds);
                 dev->dev_cur_ordered_id++;
@@ -1957,7 +1984,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd)
                 pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED\n",
                          dev->dev_cur_ordered_id);
         }
-
+restart:
         target_restart_delayed_cmds(dev);
  }
  
@@ -2557,15 +2584,10 @@ static void target_release_cmd_kref(struct kref *kref)
         bool fabric_stop;
  
         spin_lock_irqsave(&se_sess->sess_cmd_lock, flags);
-       if (list_empty(&se_cmd->se_cmd_list)) {
-               spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags);
-               target_free_cmd_mem(se_cmd);
-               se_cmd->se_tfo->release_cmd(se_cmd);
-               return;
-       }
  
         spin_lock(&se_cmd->t_state_lock);
-       fabric_stop = (se_cmd->transport_state & CMD_T_FABRIC_STOP);
+       fabric_stop = (se_cmd->transport_state & CMD_T_FABRIC_STOP) &&
+                     (se_cmd->transport_state & CMD_T_ABORTED);
         spin_unlock(&se_cmd->t_state_lock);
  
         if (se_cmd->cmd_wait_set || fabric_stop) {
diff --git a/drivers/target/tcm_fc/tfc_sess.c b/drivers/target/tcm_fc/tfc_sess.c

index f5186a74439950631c48f9735e3d9fbed1699c30..6ffbb603d9122a0259daa69db5bcca03ba891aa5 100644 (file)
--- a/drivers/target/tcm_fc/tfc_sess.c
+++ b/drivers/target/tcm_fc/tfc_sess.c
@@ -91,6 +91,7 @@ static void ft_tport_delete(struct ft_tport *tport)
  
         ft_sess_delete_all(tport);
         lport = tport->lport;
+       lport->service_params &= ~FCP_SPPF_TARG_FCN;
         BUG_ON(tport != lport->prov[FC_TYPE_FCP]);
         RCU_INIT_POINTER(lport->prov[FC_TYPE_FCP], NULL);
  
@@ -110,6 +111,7 @@ void ft_lport_add(struct fc_lport *lport, void *arg)
  {
         mutex_lock(&ft_lport_lock);
         ft_tport_get(lport);
+       lport->service_params |= FCP_SPPF_TARG_FCN;
         mutex_unlock(&ft_lport_lock);
  }
  
diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c

index 96a70789b4c215376914aaa1196785bff53ce312..4d6a5c672a3d1fd388c6f318c3000e1cfef354cb 100644 (file)
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -496,12 +496,10 @@ static int cp210x_write_reg_block(struct usb_serial_port *port, u8 req,
         void *dmabuf;
         int result;
  
-       dmabuf = kmalloc(bufsize, GFP_KERNEL);
+       dmabuf = kmemdup(buf, bufsize, GFP_KERNEL);
         if (!dmabuf)
                 return -ENOMEM;
  
-       memcpy(dmabuf, buf, bufsize);
-
         result = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0),
                         req, REQTYPE_HOST_TO_INTERFACE, 0,
                         port_priv->bInterfaceNumber, dmabuf, bufsize,
diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c

index ae8c0365abd6a8c46cbac05943574751360ca175..944de657a07a8d2957a88a599474bff57cfdee62 100644 (file)
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -350,6 +350,7 @@ void usb_serial_generic_read_bulk_callback(struct urb *urb)
         struct usb_serial_port *port = urb->context;
         unsigned char *data = urb->transfer_buffer;
         unsigned long flags;
+       int status = urb->status;
         int i;
  
         for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) {
@@ -360,22 +361,22 @@ void usb_serial_generic_read_bulk_callback(struct urb *urb)
  
         dev_dbg(&port->dev, "%s - urb %d, len %d\n", __func__, i,
                                                         urb->actual_length);
-       switch (urb->status) {
+       switch (status) {
         case 0:
                 break;
         case -ENOENT:
         case -ECONNRESET:
         case -ESHUTDOWN:
                 dev_dbg(&port->dev, "%s - urb stopped: %d\n",
-                                                       __func__, urb->status);
+                                                       __func__, status);
                 return;
         case -EPIPE:
                 dev_err(&port->dev, "%s - urb stopped: %d\n",
-                                                       __func__, urb->status);
+                                                       __func__, status);
                 return;
         default:
                 dev_dbg(&port->dev, "%s - nonzero urb status: %d\n",
-                                                       __func__, urb->status);
+                                                       __func__, status);
                 goto resubmit;
         }
  
@@ -399,6 +400,7 @@ void usb_serial_generic_write_bulk_callback(struct urb *urb)
  {
         unsigned long flags;
         struct usb_serial_port *port = urb->context;
+       int status = urb->status;
         int i;
  
         for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) {
@@ -410,22 +412,22 @@ void usb_serial_generic_write_bulk_callback(struct urb *urb)
         set_bit(i, &port->write_urbs_free);
         spin_unlock_irqrestore(&port->lock, flags);
  
-       switch (urb->status) {
+       switch (status) {
         case 0:
                 break;
         case -ENOENT:
         case -ECONNRESET:
         case -ESHUTDOWN:
                 dev_dbg(&port->dev, "%s - urb stopped: %d\n",
-                                                       __func__, urb->status);
+                                                       __func__, status);
                 return;
         case -EPIPE:
                 dev_err_console(port, "%s - urb stopped: %d\n",
-                                                       __func__, urb->status);
+                                                       __func__, status);
                 return;
         default:
                 dev_err_console(port, "%s - nonzero urb status: %d\n",
-                                                       __func__, urb->status);
+                                                       __func__, status);
                 goto resubmit;
         }
  
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c

index d96d423d00e6cdc930281f4df71ea81c5fd280bd..8e07536c233a0c90b6a69ebac97cf9c38fe61416 100644 (file)
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -273,6 +273,7 @@ static void option_instat_callback(struct urb *urb);
  #define TELIT_PRODUCT_LE922_USBCFG5            0x1045
  #define TELIT_PRODUCT_LE920                    0x1200
  #define TELIT_PRODUCT_LE910                    0x1201
+#define TELIT_PRODUCT_LE910_USBCFG4            0x1206
  
  /* ZTE PRODUCTS */
  #define ZTE_VENDOR_ID                          0x19d2
@@ -1198,6 +1199,8 @@ static const struct usb_device_id option_ids[] = {
                 .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 },
         { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910),
                 .driver_info = (kernel_ulong_t)&telit_le910_blacklist },
+       { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4),
+               .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg3 },
         { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920),
                 .driver_info = (kernel_ulong_t)&telit_le920_blacklist },
         { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF622, 0xff, 0xff, 0xff) }, /* ZTE WCDMA products */
diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c

index e7dbbef2af2a57cf3560af2b76e6b94e3d2d0144..07b4bf01061d86a6aafcdbb9d7ee7b6d331b9a05 100644 (file)
--- a/drivers/usb/serial/ti_usb_3410_5052.c
+++ b/drivers/usb/serial/ti_usb_3410_5052.c
@@ -1,5 +1,4 @@
-/* vi: ts=8 sw=8
- *
+/*
   * TI 3410/5052 USB Serial Driver
   *
   * Copyright (C) 2004 Texas Instruments
@@ -35,9 +34,238 @@
  #include <linux/usb.h>
  #include <linux/usb/serial.h>
  
-#include "ti_usb_3410_5052.h"
-
-/* Defines */
+/* Configuration ids */
+#define TI_BOOT_CONFIG                 1
+#define TI_ACTIVE_CONFIG               2
+
+/* Vendor and product ids */
+#define TI_VENDOR_ID                   0x0451
+#define IBM_VENDOR_ID                  0x04b3
+#define TI_3410_PRODUCT_ID             0x3410
+#define IBM_4543_PRODUCT_ID            0x4543
+#define IBM_454B_PRODUCT_ID            0x454b
+#define IBM_454C_PRODUCT_ID            0x454c
+#define TI_3410_EZ430_ID               0xF430  /* TI ez430 development tool */
+#define TI_5052_BOOT_PRODUCT_ID                0x5052  /* no EEPROM, no firmware */
+#define TI_5152_BOOT_PRODUCT_ID                0x5152  /* no EEPROM, no firmware */
+#define TI_5052_EEPROM_PRODUCT_ID      0x505A  /* EEPROM, no firmware */
+#define TI_5052_FIRMWARE_PRODUCT_ID    0x505F  /* firmware is running */
+#define FRI2_PRODUCT_ID                        0x5053  /* Fish River Island II */
+
+/* Multi-Tech vendor and product ids */
+#define MTS_VENDOR_ID                  0x06E0
+#define MTS_GSM_NO_FW_PRODUCT_ID       0xF108
+#define MTS_CDMA_NO_FW_PRODUCT_ID      0xF109
+#define MTS_CDMA_PRODUCT_ID            0xF110
+#define MTS_GSM_PRODUCT_ID             0xF111
+#define MTS_EDGE_PRODUCT_ID            0xF112
+#define MTS_MT9234MU_PRODUCT_ID                0xF114
+#define MTS_MT9234ZBA_PRODUCT_ID       0xF115
+#define MTS_MT9234ZBAOLD_PRODUCT_ID    0x0319
+
+/* Abbott Diabetics vendor and product ids */
+#define ABBOTT_VENDOR_ID               0x1a61
+#define ABBOTT_STEREO_PLUG_ID          0x3410
+#define ABBOTT_PRODUCT_ID              ABBOTT_STEREO_PLUG_ID
+#define ABBOTT_STRIP_PORT_ID           0x3420
+
+/* Honeywell vendor and product IDs */
+#define HONEYWELL_VENDOR_ID            0x10ac
+#define HONEYWELL_HGI80_PRODUCT_ID     0x0102  /* Honeywell HGI80 */
+
+/* Moxa UPORT 11x0 vendor and product IDs */
+#define MXU1_VENDOR_ID                         0x110a
+#define MXU1_1110_PRODUCT_ID                   0x1110
+#define MXU1_1130_PRODUCT_ID                   0x1130
+#define MXU1_1150_PRODUCT_ID                   0x1150
+#define MXU1_1151_PRODUCT_ID                   0x1151
+#define MXU1_1131_PRODUCT_ID                   0x1131
+
+/* Commands */
+#define TI_GET_VERSION                 0x01
+#define TI_GET_PORT_STATUS             0x02
+#define TI_GET_PORT_DEV_INFO           0x03
+#define TI_GET_CONFIG                  0x04
+#define TI_SET_CONFIG                  0x05
+#define TI_OPEN_PORT                   0x06
+#define TI_CLOSE_PORT                  0x07
+#define TI_START_PORT                  0x08
+#define TI_STOP_PORT                   0x09
+#define TI_TEST_PORT                   0x0A
+#define TI_PURGE_PORT                  0x0B
+#define TI_RESET_EXT_DEVICE            0x0C
+#define TI_WRITE_DATA                  0x80
+#define TI_READ_DATA                   0x81
+#define TI_REQ_TYPE_CLASS              0x82
+
+/* Module identifiers */
+#define TI_I2C_PORT                    0x01
+#define TI_IEEE1284_PORT               0x02
+#define TI_UART1_PORT                  0x03
+#define TI_UART2_PORT                  0x04
+#define TI_RAM_PORT                    0x05
+
+/* Modem status */
+#define TI_MSR_DELTA_CTS               0x01
+#define TI_MSR_DELTA_DSR               0x02
+#define TI_MSR_DELTA_RI                        0x04
+#define TI_MSR_DELTA_CD                        0x08
+#define TI_MSR_CTS                     0x10
+#define TI_MSR_DSR                     0x20
+#define TI_MSR_RI                      0x40
+#define TI_MSR_CD                      0x80
+#define TI_MSR_DELTA_MASK              0x0F
+#define TI_MSR_MASK                    0xF0
+
+/* Line status */
+#define TI_LSR_OVERRUN_ERROR           0x01
+#define TI_LSR_PARITY_ERROR            0x02
+#define TI_LSR_FRAMING_ERROR           0x04
+#define TI_LSR_BREAK                   0x08
+#define TI_LSR_ERROR                   0x0F
+#define TI_LSR_RX_FULL                 0x10
+#define TI_LSR_TX_EMPTY                        0x20
+
+/* Line control */
+#define TI_LCR_BREAK                   0x40
+
+/* Modem control */
+#define TI_MCR_LOOP                    0x04
+#define TI_MCR_DTR                     0x10
+#define TI_MCR_RTS                     0x20
+
+/* Mask settings */
+#define TI_UART_ENABLE_RTS_IN          0x0001
+#define TI_UART_DISABLE_RTS            0x0002
+#define TI_UART_ENABLE_PARITY_CHECKING 0x0008
+#define TI_UART_ENABLE_DSR_OUT         0x0010
+#define TI_UART_ENABLE_CTS_OUT         0x0020
+#define TI_UART_ENABLE_X_OUT           0x0040
+#define TI_UART_ENABLE_XA_OUT          0x0080
+#define TI_UART_ENABLE_X_IN            0x0100
+#define TI_UART_ENABLE_DTR_IN          0x0800
+#define TI_UART_DISABLE_DTR            0x1000
+#define TI_UART_ENABLE_MS_INTS         0x2000
+#define TI_UART_ENABLE_AUTO_START_DMA  0x4000
+
+/* Parity */
+#define TI_UART_NO_PARITY              0x00
+#define TI_UART_ODD_PARITY             0x01
+#define TI_UART_EVEN_PARITY            0x02
+#define TI_UART_MARK_PARITY            0x03
+#define TI_UART_SPACE_PARITY           0x04
+
+/* Stop bits */
+#define TI_UART_1_STOP_BITS            0x00
+#define TI_UART_1_5_STOP_BITS          0x01
+#define TI_UART_2_STOP_BITS            0x02
+
+/* Bits per character */
+#define TI_UART_5_DATA_BITS            0x00
+#define TI_UART_6_DATA_BITS            0x01
+#define TI_UART_7_DATA_BITS            0x02
+#define TI_UART_8_DATA_BITS            0x03
+
+/* 232/485 modes */
+#define TI_UART_232                    0x00
+#define TI_UART_485_RECEIVER_DISABLED  0x01
+#define TI_UART_485_RECEIVER_ENABLED   0x02
+
+/* Pipe transfer mode and timeout */
+#define TI_PIPE_MODE_CONTINUOUS                0x01
+#define TI_PIPE_MODE_MASK              0x03
+#define TI_PIPE_TIMEOUT_MASK           0x7C
+#define TI_PIPE_TIMEOUT_ENABLE         0x80
+
+/* Config struct */
+struct ti_uart_config {
+       __u16   wBaudRate;
+       __u16   wFlags;
+       __u8    bDataBits;
+       __u8    bParity;
+       __u8    bStopBits;
+       char    cXon;
+       char    cXoff;
+       __u8    bUartMode;
+} __packed;
+
+/* Get port status */
+struct ti_port_status {
+       __u8    bCmdCode;
+       __u8    bModuleId;
+       __u8    bErrorCode;
+       __u8    bMSR;
+       __u8    bLSR;
+} __packed;
+
+/* Purge modes */
+#define TI_PURGE_OUTPUT                        0x00
+#define TI_PURGE_INPUT                 0x80
+
+/* Read/Write data */
+#define TI_RW_DATA_ADDR_SFR            0x10
+#define TI_RW_DATA_ADDR_IDATA          0x20
+#define TI_RW_DATA_ADDR_XDATA          0x30
+#define TI_RW_DATA_ADDR_CODE           0x40
+#define TI_RW_DATA_ADDR_GPIO           0x50
+#define TI_RW_DATA_ADDR_I2C            0x60
+#define TI_RW_DATA_ADDR_FLASH          0x70
+#define TI_RW_DATA_ADDR_DSP            0x80
+
+#define TI_RW_DATA_UNSPECIFIED         0x00
+#define TI_RW_DATA_BYTE                        0x01
+#define TI_RW_DATA_WORD                        0x02
+#define TI_RW_DATA_DOUBLE_WORD         0x04
+
+struct ti_write_data_bytes {
+       __u8    bAddrType;
+       __u8    bDataType;
+       __u8    bDataCounter;
+       __be16  wBaseAddrHi;
+       __be16  wBaseAddrLo;
+       __u8    bData[0];
+} __packed;
+
+struct ti_read_data_request {
+       __u8    bAddrType;
+       __u8    bDataType;
+       __u8    bDataCounter;
+       __be16  wBaseAddrHi;
+       __be16  wBaseAddrLo;
+} __packed;
+
+struct ti_read_data_bytes {
+       __u8    bCmdCode;
+       __u8    bModuleId;
+       __u8    bErrorCode;
+       __u8    bData[0];
+} __packed;
+
+/* Interrupt struct */
+struct ti_interrupt {
+       __u8    bICode;
+       __u8    bIInfo;
+} __packed;
+
+/* Interrupt codes */
+#define TI_CODE_HARDWARE_ERROR         0xFF
+#define TI_CODE_DATA_ERROR             0x03
+#define TI_CODE_MODEM_STATUS           0x04
+
+/* Download firmware max packet size */
+#define TI_DOWNLOAD_MAX_PACKET_SIZE    64
+
+/* Firmware image header */
+struct ti_firmware_header {
+       __le16  wLength;
+       __u8    bCheckSum;
+} __packed;
+
+/* UART addresses */
+#define TI_UART1_BASE_ADDR             0xFFA0  /* UART 1 base address */
+#define TI_UART2_BASE_ADDR             0xFFB0  /* UART 2 base address */
+#define TI_UART_OFFSET_LCR             0x0002  /* UART LCR register offset */
+#define TI_UART_OFFSET_MCR             0x0004  /* UART MCR register offset */
  
  #define TI_DRIVER_AUTHOR       "Al Borchers <alborchers@steinerpoint.com>"
  #define TI_DRIVER_DESC         "TI USB 3410/5052 Serial Driver"
@@ -58,9 +286,6 @@
  
  #define TI_EXTRA_VID_PID_COUNT 5
  
-
-/* Structures */
-
  struct ti_port {
         int                     tp_is_open;
         __u8                    tp_msr;
@@ -84,9 +309,6 @@ struct ti_device {
         int                     td_urb_error;
  };
  
-
-/* Function Declarations */
-
  static int ti_startup(struct usb_serial *serial);
  static void ti_release(struct usb_serial *serial);
  static int ti_port_probe(struct usb_serial_port *port);
@@ -136,13 +358,8 @@ static int ti_write_byte(struct usb_serial_port *port, struct ti_device *tdev,
  
  static int ti_download_firmware(struct ti_device *tdev);
  
-
-/* Data */
-
-/* module parameters */
  static int closing_wait = TI_DEFAULT_CLOSING_WAIT;
  
-/* supported devices */
  static const struct usb_device_id ti_id_table_3410[] = {
         { USB_DEVICE(TI_VENDOR_ID, TI_3410_PRODUCT_ID) },
         { USB_DEVICE(TI_VENDOR_ID, TI_3410_EZ430_ID) },
@@ -174,7 +391,7 @@ static const struct usb_device_id ti_id_table_5052[] = {
         { USB_DEVICE(TI_VENDOR_ID, TI_5152_BOOT_PRODUCT_ID) },
         { USB_DEVICE(TI_VENDOR_ID, TI_5052_EEPROM_PRODUCT_ID) },
         { USB_DEVICE(TI_VENDOR_ID, TI_5052_FIRMWARE_PRODUCT_ID) },
-       { }     /* terminator */
+       { }
  };
  
  static const struct usb_device_id ti_id_table_combined[] = {
@@ -275,8 +492,6 @@ static struct usb_serial_driver * const serial_drivers[] = {
         &ti_1port_device, &ti_2port_device, NULL
  };
  
-/* Module */
-
  MODULE_AUTHOR(TI_DRIVER_AUTHOR);
  MODULE_DESCRIPTION(TI_DRIVER_DESC);
  MODULE_LICENSE("GPL");
@@ -302,8 +517,6 @@ MODULE_DEVICE_TABLE(usb, ti_id_table_combined);
  
  module_usb_serial_driver(serial_drivers, ti_id_table_combined);
  
-/* Functions */
-
  static int ti_startup(struct usb_serial *serial)
  {
         struct ti_device *tdev;
@@ -319,7 +532,6 @@ static int ti_startup(struct usb_serial *serial)
                 dev->descriptor.bNumConfigurations,
                 dev->actconfig->desc.bConfigurationValue);
  
-       /* create device structure */
         tdev = kzalloc(sizeof(struct ti_device), GFP_KERNEL);
         if (!tdev)
                 return -ENOMEM;
@@ -435,7 +647,7 @@ static int ti_open(struct tty_struct *tty, struct usb_serial_port *port)
         struct urb *urb;
         int port_number;
         int status;
-       __u16 open_settings = (__u8)(TI_PIPE_MODE_CONTINOUS |
+       __u16 open_settings = (__u8)(TI_PIPE_MODE_CONTINUOUS |
                              TI_PIPE_TIMEOUT_ENABLE |
                              (TI_TRANSFER_TIMEOUT << 2));
  
@@ -954,6 +1166,15 @@ static void ti_break(struct tty_struct *tty, int break_state)
                 dev_dbg(&port->dev, "%s - error setting break, %d\n", __func__, status);
  }
  
+static int ti_get_port_from_code(unsigned char code)
+{
+       return (code >> 4) - 3;
+}
+
+static int ti_get_func_from_code(unsigned char code)
+{
+       return code & 0x0f;
+}
  
  static void ti_interrupt_callback(struct urb *urb)
  {
@@ -995,8 +1216,8 @@ static void ti_interrupt_callback(struct urb *urb)
                 goto exit;
         }
  
-       port_number = TI_GET_PORT_FROM_CODE(data[0]);
-       function = TI_GET_FUNC_FROM_CODE(data[0]);
+       port_number = ti_get_port_from_code(data[0]);
+       function = ti_get_func_from_code(data[0]);
  
         dev_dbg(dev, "%s - port_number %d, function %d, data 0x%02X\n",
                 __func__, port_number, function, data[1]);
diff --git a/drivers/usb/serial/ti_usb_3410_5052.h b/drivers/usb/serial/ti_usb_3410_5052.h

deleted file mode 100644 (file)

index bbfd3a1..0000000
--- a/drivers/usb/serial/ti_usb_3410_5052.h
+++ /dev/null
@@ -1,259 +0,0 @@
-/* vi: ts=8 sw=8
- *
- * TI 3410/5052 USB Serial Driver Header
- *
- * Copyright (C) 2004 Texas Instruments
- *
- * This driver is based on the Linux io_ti driver, which is
- *   Copyright (C) 2000-2002 Inside Out Networks
- *   Copyright (C) 2001-2002 Greg Kroah-Hartman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * For questions or problems with this driver, contact Texas Instruments
- * technical support, or Al Borchers <alborchers@steinerpoint.com>, or
- * Peter Berger <pberger@brimson.com>.
- */
-
-#ifndef _TI_3410_5052_H_
-#define _TI_3410_5052_H_
-
-/* Configuration ids */
-#define TI_BOOT_CONFIG                 1
-#define TI_ACTIVE_CONFIG               2
-
-/* Vendor and product ids */
-#define TI_VENDOR_ID                   0x0451
-#define IBM_VENDOR_ID                  0x04b3
-#define TI_3410_PRODUCT_ID             0x3410
-#define IBM_4543_PRODUCT_ID            0x4543
-#define IBM_454B_PRODUCT_ID            0x454b
-#define IBM_454C_PRODUCT_ID            0x454c
-#define TI_3410_EZ430_ID               0xF430  /* TI ez430 development tool */
-#define TI_5052_BOOT_PRODUCT_ID                0x5052  /* no EEPROM, no firmware */
-#define TI_5152_BOOT_PRODUCT_ID                0x5152  /* no EEPROM, no firmware */
-#define TI_5052_EEPROM_PRODUCT_ID      0x505A  /* EEPROM, no firmware */
-#define TI_5052_FIRMWARE_PRODUCT_ID    0x505F  /* firmware is running */
-#define FRI2_PRODUCT_ID                        0x5053  /* Fish River Island II */
-
-/* Multi-Tech vendor and product ids */
-#define MTS_VENDOR_ID                  0x06E0
-#define MTS_GSM_NO_FW_PRODUCT_ID       0xF108
-#define MTS_CDMA_NO_FW_PRODUCT_ID      0xF109
-#define MTS_CDMA_PRODUCT_ID            0xF110
-#define MTS_GSM_PRODUCT_ID             0xF111
-#define MTS_EDGE_PRODUCT_ID            0xF112
-#define MTS_MT9234MU_PRODUCT_ID                0xF114
-#define MTS_MT9234ZBA_PRODUCT_ID       0xF115
-#define MTS_MT9234ZBAOLD_PRODUCT_ID    0x0319
-
-/* Abbott Diabetics vendor and product ids */
-#define ABBOTT_VENDOR_ID               0x1a61
-#define ABBOTT_STEREO_PLUG_ID          0x3410
-#define ABBOTT_PRODUCT_ID              ABBOTT_STEREO_PLUG_ID
-#define ABBOTT_STRIP_PORT_ID           0x3420
-
-/* Honeywell vendor and product IDs */
-#define HONEYWELL_VENDOR_ID            0x10ac
-#define HONEYWELL_HGI80_PRODUCT_ID     0x0102  /* Honeywell HGI80 */
-
-/* Moxa UPORT 11x0 vendor and product IDs */
-#define MXU1_VENDOR_ID                         0x110a
-#define MXU1_1110_PRODUCT_ID                   0x1110
-#define MXU1_1130_PRODUCT_ID                   0x1130
-#define MXU1_1131_PRODUCT_ID                   0x1131
-#define MXU1_1150_PRODUCT_ID                   0x1150
-#define MXU1_1151_PRODUCT_ID                   0x1151
-
-/* Commands */
-#define TI_GET_VERSION                 0x01
-#define TI_GET_PORT_STATUS             0x02
-#define TI_GET_PORT_DEV_INFO           0x03
-#define TI_GET_CONFIG                  0x04
-#define TI_SET_CONFIG                  0x05
-#define TI_OPEN_PORT                   0x06
-#define TI_CLOSE_PORT                  0x07
-#define TI_START_PORT                  0x08
-#define TI_STOP_PORT                   0x09
-#define TI_TEST_PORT                   0x0A
-#define TI_PURGE_PORT                  0x0B
-#define TI_RESET_EXT_DEVICE            0x0C
-#define TI_WRITE_DATA                  0x80
-#define TI_READ_DATA                   0x81
-#define TI_REQ_TYPE_CLASS              0x82
-
-/* Module identifiers */
-#define TI_I2C_PORT                    0x01
-#define TI_IEEE1284_PORT               0x02
-#define TI_UART1_PORT                  0x03
-#define TI_UART2_PORT                  0x04
-#define TI_RAM_PORT                    0x05
-
-/* Modem status */
-#define TI_MSR_DELTA_CTS               0x01
-#define TI_MSR_DELTA_DSR               0x02
-#define TI_MSR_DELTA_RI                        0x04
-#define TI_MSR_DELTA_CD                        0x08
-#define TI_MSR_CTS                     0x10
-#define TI_MSR_DSR                     0x20
-#define TI_MSR_RI                      0x40
-#define TI_MSR_CD                      0x80
-#define TI_MSR_DELTA_MASK              0x0F
-#define TI_MSR_MASK                    0xF0
-
-/* Line status */
-#define TI_LSR_OVERRUN_ERROR           0x01
-#define TI_LSR_PARITY_ERROR            0x02
-#define TI_LSR_FRAMING_ERROR           0x04
-#define TI_LSR_BREAK                   0x08
-#define TI_LSR_ERROR                   0x0F
-#define TI_LSR_RX_FULL                 0x10
-#define TI_LSR_TX_EMPTY                        0x20
-
-/* Line control */
-#define TI_LCR_BREAK                   0x40
-
-/* Modem control */
-#define TI_MCR_LOOP                    0x04
-#define TI_MCR_DTR                     0x10
-#define TI_MCR_RTS                     0x20
-
-/* Mask settings */
-#define TI_UART_ENABLE_RTS_IN          0x0001
-#define TI_UART_DISABLE_RTS            0x0002
-#define TI_UART_ENABLE_PARITY_CHECKING 0x0008
-#define TI_UART_ENABLE_DSR_OUT         0x0010
-#define TI_UART_ENABLE_CTS_OUT         0x0020
-#define TI_UART_ENABLE_X_OUT           0x0040
-#define TI_UART_ENABLE_XA_OUT          0x0080
-#define TI_UART_ENABLE_X_IN            0x0100
-#define TI_UART_ENABLE_DTR_IN          0x0800
-#define TI_UART_DISABLE_DTR            0x1000
-#define TI_UART_ENABLE_MS_INTS         0x2000
-#define TI_UART_ENABLE_AUTO_START_DMA  0x4000
-
-/* Parity */
-#define TI_UART_NO_PARITY              0x00
-#define TI_UART_ODD_PARITY             0x01
-#define TI_UART_EVEN_PARITY            0x02
-#define TI_UART_MARK_PARITY            0x03
-#define TI_UART_SPACE_PARITY           0x04
-
-/* Stop bits */
-#define TI_UART_1_STOP_BITS            0x00
-#define TI_UART_1_5_STOP_BITS          0x01
-#define TI_UART_2_STOP_BITS            0x02
-
-/* Bits per character */
-#define TI_UART_5_DATA_BITS            0x00
-#define TI_UART_6_DATA_BITS            0x01
-#define TI_UART_7_DATA_BITS            0x02
-#define TI_UART_8_DATA_BITS            0x03
-
-/* 232/485 modes */
-#define TI_UART_232                    0x00
-#define TI_UART_485_RECEIVER_DISABLED  0x01
-#define TI_UART_485_RECEIVER_ENABLED   0x02
-
-/* Pipe transfer mode and timeout */
-#define TI_PIPE_MODE_CONTINOUS         0x01
-#define TI_PIPE_MODE_MASK              0x03
-#define TI_PIPE_TIMEOUT_MASK           0x7C
-#define TI_PIPE_TIMEOUT_ENABLE         0x80
-
-/* Config struct */
-struct ti_uart_config {
-       __u16   wBaudRate;
-       __u16   wFlags;
-       __u8    bDataBits;
-       __u8    bParity;
-       __u8    bStopBits;
-       char    cXon;
-       char    cXoff;
-       __u8    bUartMode;
-} __attribute__((packed));
-
-/* Get port status */
-struct ti_port_status {
-       __u8    bCmdCode;
-       __u8    bModuleId;
-       __u8    bErrorCode;
-       __u8    bMSR;
-       __u8    bLSR;
-} __attribute__((packed));
-
-/* Purge modes */
-#define TI_PURGE_OUTPUT                        0x00
-#define TI_PURGE_INPUT                 0x80
-
-/* Read/Write data */
-#define TI_RW_DATA_ADDR_SFR            0x10
-#define TI_RW_DATA_ADDR_IDATA          0x20
-#define TI_RW_DATA_ADDR_XDATA          0x30
-#define TI_RW_DATA_ADDR_CODE           0x40
-#define TI_RW_DATA_ADDR_GPIO           0x50
-#define TI_RW_DATA_ADDR_I2C            0x60
-#define TI_RW_DATA_ADDR_FLASH          0x70
-#define TI_RW_DATA_ADDR_DSP            0x80
-
-#define TI_RW_DATA_UNSPECIFIED         0x00
-#define TI_RW_DATA_BYTE                        0x01
-#define TI_RW_DATA_WORD                        0x02
-#define TI_RW_DATA_DOUBLE_WORD         0x04
-
-struct ti_write_data_bytes {
-       __u8    bAddrType;
-       __u8    bDataType;
-       __u8    bDataCounter;
-       __be16  wBaseAddrHi;
-       __be16  wBaseAddrLo;
-       __u8    bData[0];
-} __attribute__((packed));
-
-struct ti_read_data_request {
-       __u8    bAddrType;
-       __u8    bDataType;
-       __u8    bDataCounter;
-       __be16  wBaseAddrHi;
-       __be16  wBaseAddrLo;
-} __attribute__((packed));
-
-struct ti_read_data_bytes {
-       __u8    bCmdCode;
-       __u8    bModuleId;
-       __u8    bErrorCode;
-       __u8    bData[0];
-} __attribute__((packed));
-
-/* Interrupt struct */
-struct ti_interrupt {
-       __u8    bICode;
-       __u8    bIInfo;
-} __attribute__((packed));
-
-/* Interrupt codes */
-#define TI_GET_PORT_FROM_CODE(c)       (((c) >> 4) - 3)
-#define TI_GET_FUNC_FROM_CODE(c)       ((c) & 0x0f)
-#define TI_CODE_HARDWARE_ERROR         0xFF
-#define TI_CODE_DATA_ERROR             0x03
-#define TI_CODE_MODEM_STATUS           0x04
-
-/* Download firmware max packet size */
-#define TI_DOWNLOAD_MAX_PACKET_SIZE    64
-
-/* Firmware image header */
-struct ti_firmware_header {
-       __le16  wLength;
-       __u8    bCheckSum;
-} __attribute__((packed));
-
-/* UART addresses */
-#define TI_UART1_BASE_ADDR             0xFFA0  /* UART 1 base address */
-#define TI_UART2_BASE_ADDR             0xFFB0  /* UART 2 base address */
-#define TI_UART_OFFSET_LCR             0x0002  /* UART MCR register offset */
-#define TI_UART_OFFSET_MCR             0x0004  /* UART MCR register offset */
-
-#endif /* _TI_3410_5052_H_ */
diff --git a/fs/Kconfig b/fs/Kconfig

index 4524916fa200b1b4e9c8349ff94c78711595be8d..2bc7ad77584287870681fc940995b355322eadc2 100644 (file)
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -70,6 +70,12 @@ config FS_POSIX_ACL
  config EXPORTFS
         tristate
  
+config EXPORTFS_BLOCK_OPS
+       bool "Enable filesystem export operations for block IO"
+       help
+         This option enables the export operations for a filesystem to support
+         external block IO.
+
  config FILE_LOCKING
         bool "Enable POSIX file locking API" if EXPERT
         default y
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt

index 72c03354c14bf5e10c405f8d778d581f18c2e250..c7efddf6e0380988b63337cef8c0657e0df53ec8 100644 (file)
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -89,7 +89,8 @@ config BINFMT_SCRIPT
  
  config BINFMT_FLAT
         bool "Kernel support for flat binaries"
-       depends on !MMU && (!FRV || BROKEN)
+       depends on !MMU || M68K
+       depends on !FRV || BROKEN
         help
           Support uClinux FLAT format binaries.
  
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c

index 203589311bf88733bdb53d329a3b05a490ce9164..464a972e88c133a917493bd19a68e375c6dfd1d2 100644 (file)
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -67,8 +67,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *,
                                    struct elf_fdpic_params *);
  
  #ifndef CONFIG_MMU
-static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *,
-                                           unsigned long *);
  static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *,
                                                    struct file *,
                                                    struct mm_struct *);
@@ -515,8 +513,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
         sp = mm->start_stack;
  
         /* stack the program arguments and environment */
-       if (elf_fdpic_transfer_args_to_stack(bprm, &sp) < 0)
+       if (transfer_args_to_stack(bprm, &sp) < 0)
                 return -EFAULT;
+       sp &= ~15;
  #endif
  
         /*
@@ -709,39 +708,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
         return 0;
  }
  
-/*****************************************************************************/
-/*
- * transfer the program arguments and environment from the holding pages onto
- * the stack
- */
-#ifndef CONFIG_MMU
-static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm,
-                                           unsigned long *_sp)
-{
-       unsigned long index, stop, sp;
-       char *src;
-       int ret = 0;
-
-       stop = bprm->p >> PAGE_SHIFT;
-       sp = *_sp;
-
-       for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
-               src = kmap(bprm->page[index]);
-               sp -= PAGE_SIZE;
-               if (copy_to_user((void *) sp, src, PAGE_SIZE) != 0)
-                       ret = -EFAULT;
-               kunmap(bprm->page[index]);
-               if (ret < 0)
-                       goto out;
-       }
-
-       *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15;
-
-out:
-       return ret;
-}
-#endif
-
  /*****************************************************************************/
  /*
   * load the appropriate binary image (executable or interpreter) into memory
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c

index caf9e39bb82b7fbd0b2387b6dea5332cdc5699de..9b2917a3029406f18ce2a0a0bbf62fdef7bca192 100644 (file)
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -15,7 +15,8 @@
   *     JAN/99 -- coded full program relocation (gerg@snapgear.com)
   */
  
-#include <linux/export.h>
+#define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
+
  #include <linux/kernel.h>
  #include <linux/sched.h>
  #include <linux/mm.h>
@@ -25,8 +26,6 @@
  #include <linux/string.h>
  #include <linux/fs.h>
  #include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
  #include <linux/ptrace.h>
  #include <linux/user.h>
  #include <linux/slab.h>
@@ -34,26 +33,16 @@
  #include <linux/personality.h>
  #include <linux/init.h>
  #include <linux/flat.h>
-#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
  
  #include <asm/byteorder.h>
-#include <asm/uaccess.h>
  #include <asm/unaligned.h>
  #include <asm/cacheflush.h>
  #include <asm/page.h>
  
  /****************************************************************************/
  
-#if 0
-#define DEBUG 1
-#endif
-
-#ifdef DEBUG
-#define        DBG_FLT(a...)   printk(a)
-#else
-#define        DBG_FLT(a...)
-#endif
-
  /*
   * User data (data section and bss) needs to be aligned.
   * We pick 0x20 here because it is the max value elf2flt has always
@@ -80,7 +69,7 @@ struct lib_info {
                 unsigned long text_len;                 /* Length of text segment */
                 unsigned long entry;                    /* Start address for this module */
                 unsigned long build_date;               /* When this one was compiled */
-               short loaded;                           /* Has this library been loaded? */
+               bool loaded;                            /* Has this library been loaded? */
         } lib_list[MAX_SHARED_LIBS];
  };
  
@@ -106,59 +95,67 @@ static struct linux_binfmt flat_format = {
  
  static int flat_core_dump(struct coredump_params *cprm)
  {
-       printk("Process %s:%d received signr %d and should have core dumped\n",
-                       current->comm, current->pid, (int) cprm->siginfo->si_signo);
-       return(1);
+       pr_warn("Process %s:%d received signr %d and should have core dumped\n",
+               current->comm, current->pid, cprm->siginfo->si_signo);
+       return 1;
  }
  
  /****************************************************************************/
  /*
   * create_flat_tables() parses the env- and arg-strings in new user
   * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
+ * addresses on the "stack", recording the new stack pointer value.
   */
  
-static unsigned long create_flat_tables(
-       unsigned long pp,
-       struct linux_binprm * bprm)
+static int create_flat_tables(struct linux_binprm *bprm, unsigned long arg_start)
  {
-       unsigned long *argv,*envp;
-       unsigned long * sp;
-       char * p = (char*)pp;
-       int argc = bprm->argc;
-       int envc = bprm->envc;
-       char uninitialized_var(dummy);
-
-       sp = (unsigned long *)p;
-       sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
-       sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
-       argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
-       envp = argv + (argc + 1);
+       char __user *p;
+       unsigned long __user *sp;
+       long i, len;
+
+       p = (char __user *)arg_start;
+       sp = (unsigned long __user *)current->mm->start_stack;
+
+       sp -= bprm->envc + 1;
+       sp -= bprm->argc + 1;
+       sp -= flat_argvp_envp_on_stack() ? 2 : 0;
+       sp -= 1;  /* &argc */
  
+       current->mm->start_stack = (unsigned long)sp & -FLAT_STACK_ALIGN;
+       sp = (unsigned long __user *)current->mm->start_stack;
+
+       __put_user(bprm->argc, sp++);
         if (flat_argvp_envp_on_stack()) {
-               put_user((unsigned long) envp, sp + 2);
-               put_user((unsigned long) argv, sp + 1);
-       }
-
-       put_user(argc, sp);
-       current->mm->arg_start = (unsigned long) p;
-       while (argc-->0) {
-               put_user((unsigned long) p, argv++);
-               do {
-                       get_user(dummy, p); p++;
-               } while (dummy);
-       }
-       put_user((unsigned long) NULL, argv);
-       current->mm->arg_end = current->mm->env_start = (unsigned long) p;
-       while (envc-->0) {
-               put_user((unsigned long)p, envp); envp++;
-               do {
-                       get_user(dummy, p); p++;
-               } while (dummy);
-       }
-       put_user((unsigned long) NULL, envp);
-       current->mm->env_end = (unsigned long) p;
-       return (unsigned long)sp;
+               unsigned long argv, envp;
+               argv = (unsigned long)(sp + 2);
+               envp = (unsigned long)(sp + 2 + bprm->argc + 1);
+               __put_user(argv, sp++);
+               __put_user(envp, sp++);
+       }
+
+       current->mm->arg_start = (unsigned long)p;
+       for (i = bprm->argc; i > 0; i--) {
+               __put_user((unsigned long)p, sp++);
+               len = strnlen_user(p, MAX_ARG_STRLEN);
+               if (!len || len > MAX_ARG_STRLEN)
+                       return -EINVAL;
+               p += len;
+       }
+       __put_user(0, sp++);
+       current->mm->arg_end = (unsigned long)p;
+
+       current->mm->env_start = (unsigned long) p;
+       for (i = bprm->envc; i > 0; i--) {
+               __put_user((unsigned long)p, sp++);
+               len = strnlen_user(p, MAX_ARG_STRLEN);
+               if (!len || len > MAX_ARG_STRLEN)
+                       return -EINVAL;
+               p += len;
+       }
+       __put_user(0, sp++);
+       current->mm->env_end = (unsigned long)p;
+
+       return 0;
  }
  
  /****************************************************************************/
@@ -190,17 +187,17 @@ static int decompress_exec(
         loff_t fpos;
         int ret, retval;
  
-       DBG_FLT("decompress_exec(offset=%x,buf=%x,len=%x)\n",(int)offset, (int)dst, (int)len);
+       pr_debug("decompress_exec(offset=%lx,buf=%p,len=%lx)\n", offset, dst, len);
  
         memset(&strm, 0, sizeof(strm));
         strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
         if (strm.workspace == NULL) {
-               DBG_FLT("binfmt_flat: no memory for decompress workspace\n");
+               pr_debug("no memory for decompress workspace\n");
                 return -ENOMEM;
         }
         buf = kmalloc(LBUFSIZE, GFP_KERNEL);
         if (buf == NULL) {
-               DBG_FLT("binfmt_flat: no memory for read buffer\n");
+               pr_debug("no memory for read buffer\n");
                 retval = -ENOMEM;
                 goto out_free;
         }
@@ -218,49 +215,49 @@ static int decompress_exec(
  
         /* Check minimum size -- gzip header */
         if (ret < 10) {
-               DBG_FLT("binfmt_flat: file too small?\n");
+               pr_debug("file too small?\n");
                 goto out_free_buf;
         }
  
         /* Check gzip magic number */
         if ((buf[0] != 037) || ((buf[1] != 0213) && (buf[1] != 0236))) {
-               DBG_FLT("binfmt_flat: unknown compression magic?\n");
+               pr_debug("unknown compression magic?\n");
                 goto out_free_buf;
         }
  
         /* Check gzip method */
         if (buf[2] != 8) {
-               DBG_FLT("binfmt_flat: unknown compression method?\n");
+               pr_debug("unknown compression method?\n");
                 goto out_free_buf;
         }
         /* Check gzip flags */
         if ((buf[3] & ENCRYPTED) || (buf[3] & CONTINUATION) ||
             (buf[3] & RESERVED)) {
-               DBG_FLT("binfmt_flat: unknown flags?\n");
+               pr_debug("unknown flags?\n");
                 goto out_free_buf;
         }
  
         ret = 10;
         if (buf[3] & EXTRA_FIELD) {
                 ret += 2 + buf[10] + (buf[11] << 8);
-               if (unlikely(LBUFSIZE <= ret)) {
-                       DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n");
+               if (unlikely(ret >= LBUFSIZE)) {
+                       pr_debug("buffer overflow (EXTRA)?\n");
                         goto out_free_buf;
                 }
         }
         if (buf[3] & ORIG_NAME) {
                 while (ret < LBUFSIZE && buf[ret++] != 0)
                         ;
-               if (unlikely(LBUFSIZE == ret)) {
-                       DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n");
+               if (unlikely(ret == LBUFSIZE)) {
+                       pr_debug("buffer overflow (ORIG_NAME)?\n");
                         goto out_free_buf;
                 }
         }
         if (buf[3] & COMMENT) {
                 while (ret < LBUFSIZE && buf[ret++] != 0)
                         ;
-               if (unlikely(LBUFSIZE == ret)) {
-                       DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n");
+               if (unlikely(ret == LBUFSIZE)) {
+                       pr_debug("buffer overflow (COMMENT)?\n");
                         goto out_free_buf;
                 }
         }
@@ -273,7 +270,7 @@ static int decompress_exec(
         strm.total_out = 0;
  
         if (zlib_inflateInit2(&strm, -MAX_WBITS) != Z_OK) {
-               DBG_FLT("binfmt_flat: zlib init failed?\n");
+               pr_debug("zlib init failed?\n");
                 goto out_free_buf;
         }
  
@@ -290,7 +287,7 @@ static int decompress_exec(
         }
  
         if (ret < 0) {
-               DBG_FLT("binfmt_flat: decompression failed (%d), %s\n",
+               pr_debug("decompression failed (%d), %s\n",
                         ret, strm.msg);
                 goto out_zlib;
         }
@@ -327,24 +324,23 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
                 r &= 0x00ffffff;        /* Trim ID off here */
         }
         if (id >= MAX_SHARED_LIBS) {
-               printk("BINFMT_FLAT: reference 0x%x to shared library %d",
-                               (unsigned) r, id);
+               pr_err("reference 0x%lx to shared library %d", r, id);
                 goto failed;
         }
         if (curid != id) {
                 if (internalp) {
-                       printk("BINFMT_FLAT: reloc address 0x%x not in same module "
-                                       "(%d != %d)", (unsigned) r, curid, id);
+                       pr_err("reloc address 0x%lx not in same module "
+                              "(%d != %d)", r, curid, id);
                         goto failed;
-               } else if ( ! p->lib_list[id].loaded &&
-                               load_flat_shared_library(id, p) < 0) {
-                       printk("BINFMT_FLAT: failed to load library %d", id);
+               } else if (!p->lib_list[id].loaded &&
+                          load_flat_shared_library(id, p) < 0) {
+                       pr_err("failed to load library %d", id);
                         goto failed;
                 }
                 /* Check versioning information (i.e. time stamps) */
                 if (p->lib_list[id].build_date && p->lib_list[curid].build_date &&
                                 p->lib_list[curid].build_date < p->lib_list[id].build_date) {
-                       printk("BINFMT_FLAT: library %d is younger than %d", id, curid);
+                       pr_err("library %d is younger than %d", id, curid);
                         goto failed;
                 }
         }
@@ -358,8 +354,8 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
         text_len = p->lib_list[id].text_len;
  
         if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
-               printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)",
-                      (int) r,(int)(start_brk-start_data+text_len),(int)text_len);
+               pr_err("reloc outside program 0x%lx (0 - 0x%lx/0x%lx)",
+                      r, start_brk-start_data+text_len, text_len);
                 goto failed;
         }
  
@@ -369,10 +365,10 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
                 addr = r - text_len + start_data;
  
         /* Range checked already above so doing the range tests is redundant...*/
-       return(addr);
+       return addr;
  
  failed:
-       printk(", killing %s!\n", current->comm);
+       pr_cont(", killing %s!\n", current->comm);
         send_sig(SIGSEGV, current, 0);
  
         return RELOC_FAILED;
@@ -382,62 +378,57 @@ failed:
  
  static void old_reloc(unsigned long rl)
  {
-#ifdef DEBUG
-       char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" };
-#endif
+       static const char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" };
         flat_v2_reloc_t r;
-       unsigned long *ptr;
-       
+       unsigned long __user *ptr;
+       unsigned long val;
+
         r.value = rl;
  #if defined(CONFIG_COLDFIRE)
-       ptr = (unsigned long *) (current->mm->start_code + r.reloc.offset);
+       ptr = (unsigned long __user *)(current->mm->start_code + r.reloc.offset);
  #else
-       ptr = (unsigned long *) (current->mm->start_data + r.reloc.offset);
+       ptr = (unsigned long __user *)(current->mm->start_data + r.reloc.offset);
  #endif
+       get_user(val, ptr);
+
+       pr_debug("Relocation of variable at DATASEG+%x "
+                "(address %p, currently %lx) into segment %s\n",
+                r.reloc.offset, ptr, val, segment[r.reloc.type]);
  
-#ifdef DEBUG
-       printk("Relocation of variable at DATASEG+%x "
-               "(address %p, currently %x) into segment %s\n",
-               r.reloc.offset, ptr, (int)*ptr, segment[r.reloc.type]);
-#endif
-       
         switch (r.reloc.type) {
         case OLD_FLAT_RELOC_TYPE_TEXT:
-               *ptr += current->mm->start_code;
+               val += current->mm->start_code;
                 break;
         case OLD_FLAT_RELOC_TYPE_DATA:
-               *ptr += current->mm->start_data;
+               val += current->mm->start_data;
                 break;
         case OLD_FLAT_RELOC_TYPE_BSS:
-               *ptr += current->mm->end_data;
+               val += current->mm->end_data;
                 break;
         default:
-               printk("BINFMT_FLAT: Unknown relocation type=%x\n", r.reloc.type);
+               pr_err("Unknown relocation type=%x\n", r.reloc.type);
                 break;
         }
+       put_user(val, ptr);
  
-#ifdef DEBUG
-       printk("Relocation became %x\n", (int)*ptr);
-#endif
-}              
+       pr_debug("Relocation became %lx\n", val);
+}
  
  /****************************************************************************/
  
-static int load_flat_file(struct linux_binprm * bprm,
+static int load_flat_file(struct linux_binprm *bprm,
                 struct lib_info *libinfo, int id, unsigned long *extra_stack)
  {
-       struct flat_hdr * hdr;
-       unsigned long textpos = 0, datapos = 0, result;
-       unsigned long realdatastart = 0;
-       unsigned long text_len, data_len, bss_len, stack_len, flags;
-       unsigned long full_data;
-       unsigned long len, memp = 0;
-       unsigned long memp_size, extra, rlim;
-       unsigned long *reloc = 0, *rp;
+       struct flat_hdr *hdr;
+       unsigned long textpos, datapos, realdatastart;
+       unsigned long text_len, data_len, bss_len, stack_len, full_data, flags;
+       unsigned long len, memp, memp_size, extra, rlim;
+       unsigned long __user *reloc, *rp;
         struct inode *inode;
-       int i, rev, relocs = 0;
+       int i, rev, relocs;
         loff_t fpos;
         unsigned long start_code, end_code;
+       ssize_t result;
         int ret;
  
         hdr = ((struct flat_hdr *) bprm->buf);          /* exec-header */
@@ -469,20 +460,30 @@ static int load_flat_file(struct linux_binprm * bprm,
         }
  
         if (flags & FLAT_FLAG_KTRACE)
-               printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename);
+               pr_info("Loading file: %s\n", bprm->filename);
  
         if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) {
-               printk("BINFMT_FLAT: bad flat file version 0x%x (supported "
-                       "0x%lx and 0x%lx)\n",
-                       rev, FLAT_VERSION, OLD_FLAT_VERSION);
+               pr_err("bad flat file version 0x%x (supported 0x%lx and 0x%lx)\n",
+                      rev, FLAT_VERSION, OLD_FLAT_VERSION);
                 ret = -ENOEXEC;
                 goto err;
         }
-       
+
         /* Don't allow old format executables to use shared libraries */
         if (rev == OLD_FLAT_VERSION && id != 0) {
-               printk("BINFMT_FLAT: shared libraries are not available before rev 0x%x\n",
-                               (int) FLAT_VERSION);
+               pr_err("shared libraries are not available before rev 0x%lx\n",
+                      FLAT_VERSION);
+               ret = -ENOEXEC;
+               goto err;
+       }
+
+       /*
+        * Make sure the header params are sane.
+        * 28 bits (256 MB) is way more than reasonable in this case.
+        * If some top bits are set we have probable binary corruption.
+       */
+       if ((text_len | data_len | bss_len | stack_len | full_data) >> 28) {
+               pr_err("bad header\n");
                 ret = -ENOEXEC;
                 goto err;
         }
@@ -496,7 +497,7 @@ static int load_flat_file(struct linux_binprm * bprm,
  
  #ifndef CONFIG_BINFMT_ZFLAT
         if (flags & (FLAT_FLAG_GZIP|FLAT_FLAG_GZDATA)) {
-               printk("Support for ZFLAT executables is not enabled.\n");
+               pr_err("Support for ZFLAT executables is not enabled.\n");
                 ret = -ENOEXEC;
                 goto err;
         }
@@ -517,11 +518,9 @@ static int load_flat_file(struct linux_binprm * bprm,
  
         /* Flush all traces of the currently running executable */
         if (id == 0) {
-               result = flush_old_exec(bprm);
-               if (result) {
-                       ret = result;
+               ret = flush_old_exec(bprm);
+               if (ret)
                         goto err;
-               }
  
                 /* OK, This is the point of no return */
                 set_personality(PER_LINUX_32BIT);
@@ -539,48 +538,48 @@ static int load_flat_file(struct linux_binprm * bprm,
          * case,  and then the fully copied to RAM case which lumps
          * it all together.
          */
-       if ((flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP)) == 0) {
+       if (!IS_ENABLED(CONFIG_MMU) && !(flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) {
                 /*
                  * this should give us a ROM ptr,  but if it doesn't we don't
                  * really care
                  */
-               DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n");
+               pr_debug("ROM mapping of file (we hope)\n");
  
                 textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
                                   MAP_PRIVATE|MAP_EXECUTABLE, 0);
                 if (!textpos || IS_ERR_VALUE(textpos)) {
-                       if (!textpos)
-                               textpos = (unsigned long) -ENOMEM;
-                       printk("Unable to mmap process text, errno %d\n", (int)-textpos);
                         ret = textpos;
+                       if (!textpos)
+                               ret = -ENOMEM;
+                       pr_err("Unable to mmap process text, errno %d\n", ret);
                         goto err;
                 }
  
                 len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
                 len = PAGE_ALIGN(len);
-               realdatastart = vm_mmap(0, 0, len,
+               realdatastart = vm_mmap(NULL, 0, len,
                         PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
  
                 if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
+                       ret = realdatastart;
                         if (!realdatastart)
-                               realdatastart = (unsigned long) -ENOMEM;
-                       printk("Unable to allocate RAM for process data, errno %d\n",
-                                       (int)-realdatastart);
+                               ret = -ENOMEM;
+                       pr_err("Unable to allocate RAM for process data, "
+                              "errno %d\n", ret);
                         vm_munmap(textpos, text_len);
-                       ret = realdatastart;
                         goto err;
                 }
                 datapos = ALIGN(realdatastart +
                                 MAX_SHARED_LIBS * sizeof(unsigned long),
                                 FLAT_DATA_ALIGN);
  
-               DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n",
-                               (int)(data_len + bss_len + stack_len), (int)datapos);
+               pr_debug("Allocated data+bss+stack (%ld bytes): %lx\n",
+                        data_len + bss_len + stack_len, datapos);
  
                 fpos = ntohl(hdr->data_start);
  #ifdef CONFIG_BINFMT_ZFLAT
                 if (flags & FLAT_FLAG_GZDATA) {
-                       result = decompress_exec(bprm, fpos, (char *) datapos, 
+                       result = decompress_exec(bprm, fpos, (char *)datapos,
                                                  full_data, 0);
                 } else
  #endif
@@ -589,29 +588,30 @@ static int load_flat_file(struct linux_binprm * bprm,
                                         full_data);
                 }
                 if (IS_ERR_VALUE(result)) {
-                       printk("Unable to read data+bss, errno %d\n", (int)-result);
+                       ret = result;
+                       pr_err("Unable to read data+bss, errno %d\n", ret);
                         vm_munmap(textpos, text_len);
                         vm_munmap(realdatastart, len);
-                       ret = result;
                         goto err;
                 }
  
-               reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len));
+               reloc = (unsigned long __user *)
+                       (datapos + (ntohl(hdr->reloc_start) - text_len));
                 memp = realdatastart;
                 memp_size = len;
         } else {
  
                 len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
                 len = PAGE_ALIGN(len);
-               textpos = vm_mmap(0, 0, len,
+               textpos = vm_mmap(NULL, 0, len,
                         PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
  
                 if (!textpos || IS_ERR_VALUE(textpos)) {
-                       if (!textpos)
-                               textpos = (unsigned long) -ENOMEM;
-                       printk("Unable to allocate RAM for process text/data, errno %d\n",
-                                       (int)-textpos);
                         ret = textpos;
+                       if (!textpos)
+                               ret = -ENOMEM;
+                       pr_err("Unable to allocate RAM for process text/data, "
+                              "errno %d\n", ret);
                         goto err;
                 }
  
@@ -620,7 +620,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                                 MAX_SHARED_LIBS * sizeof(unsigned long),
                                 FLAT_DATA_ALIGN);
  
-               reloc = (unsigned long *)
+               reloc = (unsigned long __user *)
                         (datapos + (ntohl(hdr->reloc_start) - text_len));
                 memp = textpos;
                 memp_size = len;
@@ -629,21 +629,59 @@ static int load_flat_file(struct linux_binprm * bprm,
                  * load it all in and treat it like a RAM load from now on
                  */
                 if (flags & FLAT_FLAG_GZIP) {
-                       result = decompress_exec(bprm, sizeof (struct flat_hdr),
-                                        (((char *) textpos) + sizeof (struct flat_hdr)),
+#ifndef CONFIG_MMU
+                       result = decompress_exec(bprm, sizeof(struct flat_hdr),
+                                        (((char *)textpos) + sizeof(struct flat_hdr)),
                                          (text_len + full_data
-                                                 - sizeof (struct flat_hdr)),
+                                                 - sizeof(struct flat_hdr)),
                                          0);
                         memmove((void *) datapos, (void *) realdatastart,
                                         full_data);
+#else
+                       /*
+                        * This is used on MMU systems mainly for testing.
+                        * Let's use a kernel buffer to simplify things.
+                        */
+                       long unz_text_len = text_len - sizeof(struct flat_hdr);
+                       long unz_len = unz_text_len + full_data;
+                       char *unz_data = vmalloc(unz_len);
+                       if (!unz_data) {
+                               result = -ENOMEM;
+                       } else {
+                               result = decompress_exec(bprm, sizeof(struct flat_hdr),
+                                                        unz_data, unz_len, 0);
+                               if (result == 0 &&
+                                   (copy_to_user((void __user *)textpos + sizeof(struct flat_hdr),
+                                                 unz_data, unz_text_len) ||
+                                    copy_to_user((void __user *)datapos,
+                                                 unz_data + unz_text_len, full_data)))
+                                       result = -EFAULT;
+                               vfree(unz_data);
+                       }
+#endif
                 } else if (flags & FLAT_FLAG_GZDATA) {
                         result = read_code(bprm->file, textpos, 0, text_len);
-                       if (!IS_ERR_VALUE(result))
+                       if (!IS_ERR_VALUE(result)) {
+#ifndef CONFIG_MMU
                                 result = decompress_exec(bprm, text_len, (char *) datapos,
                                                  full_data, 0);
-               }
-               else
+#else
+                               char *unz_data = vmalloc(full_data);
+                               if (!unz_data) {
+                                       result = -ENOMEM;
+                               } else {
+                                       result = decompress_exec(bprm, text_len,
+                                                      unz_data, full_data, 0);
+                                       if (result == 0 &&
+                                           copy_to_user((void __user *)datapos,
+                                                        unz_data, full_data))
+                                               result = -EFAULT;
+                                       vfree(unz_data);
+                               }
  #endif
+                       }
+               } else
+#endif /* CONFIG_BINFMT_ZFLAT */
                 {
                         result = read_code(bprm->file, textpos, 0, text_len);
                         if (!IS_ERR_VALUE(result))
@@ -652,21 +690,19 @@ static int load_flat_file(struct linux_binprm * bprm,
                                                    full_data);
                 }
                 if (IS_ERR_VALUE(result)) {
-                       printk("Unable to read code+data+bss, errno %d\n",(int)-result);
+                       ret = result;
+                       pr_err("Unable to read code+data+bss, errno %d\n", ret);
                         vm_munmap(textpos, text_len + data_len + extra +
                                 MAX_SHARED_LIBS * sizeof(unsigned long));
-                       ret = result;
                         goto err;
                 }
         }
  
-       if (flags & FLAT_FLAG_KTRACE)
-               printk("Mapping is %x, Entry point is %x, data_start is %x\n",
-                       (int)textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start));
+       start_code = textpos + sizeof(struct flat_hdr);
+       end_code = textpos + text_len;
+       text_len -= sizeof(struct flat_hdr); /* the real code len */
  
         /* The main program needs a little extra setup in the task structure */
-       start_code = textpos + sizeof (struct flat_hdr);
-       end_code = textpos + text_len;
         if (id == 0) {
                 current->mm->start_code = start_code;
                 current->mm->end_code = end_code;
@@ -681,19 +717,19 @@ static int load_flat_file(struct linux_binprm * bprm,
                  */
                 current->mm->start_brk = datapos + data_len + bss_len;
                 current->mm->brk = (current->mm->start_brk + 3) & ~3;
+#ifndef CONFIG_MMU
                 current->mm->context.end_brk = memp + memp_size - stack_len;
+#endif
         }
  
-       if (flags & FLAT_FLAG_KTRACE)
-               printk("%s %s: TEXT=%x-%x DATA=%x-%x BSS=%x-%x\n",
+       if (flags & FLAT_FLAG_KTRACE) {
+               pr_info("Mapping is %lx, Entry point is %x, data_start is %x\n",
+                       textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start));
+               pr_info("%s %s: TEXT=%lx-%lx DATA=%lx-%lx BSS=%lx-%lx\n",
                         id ? "Lib" : "Load", bprm->filename,
-                       (int) start_code, (int) end_code,
-                       (int) datapos,
-                       (int) (datapos + data_len),
-                       (int) (datapos + data_len),
-                       (int) (((datapos + data_len + bss_len) + 3) & ~3));
-
-       text_len -= sizeof(struct flat_hdr); /* the real code len */
+                       start_code, end_code, datapos, datapos + data_len,
+                       datapos + data_len, (datapos + data_len + bss_len + 3) & ~3);
+       }
  
         /* Store the current module values into the global library structure */
         libinfo->lib_list[id].start_code = start_code;
@@ -703,7 +739,7 @@ static int load_flat_file(struct linux_binprm * bprm,
         libinfo->lib_list[id].loaded = 1;
         libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos;
         libinfo->lib_list[id].build_date = ntohl(hdr->build_date);
-       
+
         /*
          * We just load the allocations into some temporary memory to
          * help simplify all this mumbo jumbo
@@ -717,15 +753,20 @@ static int load_flat_file(struct linux_binprm * bprm,
          * image.
          */
         if (flags & FLAT_FLAG_GOTPIC) {
-               for (rp = (unsigned long *)datapos; *rp != 0xffffffff; rp++) {
-                       unsigned long addr;
-                       if (*rp) {
-                               addr = calc_reloc(*rp, libinfo, id, 0);
+               for (rp = (unsigned long __user *)datapos; ; rp++) {
+                       unsigned long addr, rp_val;
+                       if (get_user(rp_val, rp))
+                               return -EFAULT;
+                       if (rp_val == 0xffffffff)
+                               break;
+                       if (rp_val) {
+                               addr = calc_reloc(rp_val, libinfo, id, 0);
                                 if (addr == RELOC_FAILED) {
                                         ret = -ENOEXEC;
                                         goto err;
                                 }
-                               *rp = addr;
+                               if (put_user(addr, rp))
+                                       return -EFAULT;
                         }
                 }
         }
@@ -742,19 +783,23 @@ static int load_flat_file(struct linux_binprm * bprm,
          * __start to address 4 so that is okay).
          */
         if (rev > OLD_FLAT_VERSION) {
-               unsigned long persistent = 0;
-               for (i=0; i < relocs; i++) {
+               unsigned long __maybe_unused persistent = 0;
+               for (i = 0; i < relocs; i++) {
                         unsigned long addr, relval;
  
-                       /* Get the address of the pointer to be
-                          relocated (of course, the address has to be
-                          relocated first).  */
-                       relval = ntohl(reloc[i]);
-                       if (flat_set_persistent (relval, &persistent))
+                       /*
+                        * Get the address of the pointer to be
+                        * relocated (of course, the address has to be
+                        * relocated first).
+                        */
+                       if (get_user(relval, reloc + i))
+                               return -EFAULT;
+                       relval = ntohl(relval);
+                       if (flat_set_persistent(relval, &persistent))
                                 continue;
                         addr = flat_get_relocate_addr(relval);
-                       rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1);
-                       if (rp == (unsigned long *)RELOC_FAILED) {
+                       rp = (unsigned long __user *)calc_reloc(addr, libinfo, id, 1);
+                       if (rp == (unsigned long __user *)RELOC_FAILED) {
                                 ret = -ENOEXEC;
                                 goto err;
                         }
@@ -780,17 +825,23 @@ static int load_flat_file(struct linux_binprm * bprm,
                         }
                 }
         } else {
-               for (i=0; i < relocs; i++)
-                       old_reloc(ntohl(reloc[i]));
+               for (i = 0; i < relocs; i++) {
+                       unsigned long relval;
+                       if (get_user(relval, reloc + i))
+                               return -EFAULT;
+                       relval = ntohl(relval);
+                       old_reloc(relval);
+               }
         }
-       
+
         flush_icache_range(start_code, end_code);
  
         /* zero the BSS,  BRK and stack areas */
-       memset((void*)(datapos + data_len), 0, bss_len + 
-                       (memp + memp_size - stack_len -         /* end brk */
-                       libinfo->lib_list[id].start_brk) +      /* start brk */
-                       stack_len);
+       if (clear_user((void __user *)(datapos + data_len), bss_len +
+                      (memp + memp_size - stack_len -          /* end brk */
+                      libinfo->lib_list[id].start_brk) +       /* start brk */
+                      stack_len))
+               return -EFAULT;
  
         return 0;
  err:
@@ -846,7 +897,7 @@ out:
         allow_write_access(bprm.file);
         fput(bprm.file);
  
-       return(res);
+       return res;
  }
  
  #endif /* CONFIG_BINFMT_SHARED_FLAT */
@@ -857,18 +908,17 @@ out:
   * libraries.  There is no binary dependent code anywhere else.
   */
  
-static int load_flat_binary(struct linux_binprm * bprm)
+static int load_flat_binary(struct linux_binprm *bprm)
  {
         struct lib_info libinfo;
         struct pt_regs *regs = current_pt_regs();
-       unsigned long p = bprm->p;
-       unsigned long stack_len;
+       unsigned long stack_len = 0;
         unsigned long start_addr;
-       unsigned long *sp;
         int res;
         int i, j;
  
         memset(&libinfo, 0, sizeof(libinfo));
+
         /*
          * We have to add the size of our arguments to our stack size
          * otherwise it's too easy for users to create stack overflows
@@ -876,38 +926,54 @@ static int load_flat_binary(struct linux_binprm * bprm)
          * pedantic and include space for the argv/envp array as it may have
          * a lot of entries.
          */
-#define TOP_OF_ARGS (PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *))
-       stack_len = TOP_OF_ARGS - bprm->p;             /* the strings */
-       stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
-       stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
-       stack_len += FLAT_STACK_ALIGN - 1;  /* reserve for upcoming alignment */
-       
+#ifndef CONFIG_MMU
+       stack_len += PAGE_SIZE * MAX_ARG_PAGES - bprm->p; /* the strings */
+#endif
+       stack_len += (bprm->argc + 1) * sizeof(char *);   /* the argv array */
+       stack_len += (bprm->envc + 1) * sizeof(char *);   /* the envp array */
+       stack_len = ALIGN(stack_len, FLAT_STACK_ALIGN);
+
         res = load_flat_file(bprm, &libinfo, 0, &stack_len);
         if (res < 0)
                 return res;
-       
+
         /* Update data segment pointers for all libraries */
-       for (i=0; i<MAX_SHARED_LIBS; i++)
-               if (libinfo.lib_list[i].loaded)
-                       for (j=0; j<MAX_SHARED_LIBS; j++)
-                               (-(j+1))[(unsigned long *)(libinfo.lib_list[i].start_data)] =
-                                       (libinfo.lib_list[j].loaded)?
-                                               libinfo.lib_list[j].start_data:UNLOADED_LIB;
+       for (i = 0; i < MAX_SHARED_LIBS; i++) {
+               if (!libinfo.lib_list[i].loaded)
+                       continue;
+               for (j = 0; j < MAX_SHARED_LIBS; j++) {
+                       unsigned long val = libinfo.lib_list[j].loaded ?
+                               libinfo.lib_list[j].start_data : UNLOADED_LIB;
+                       unsigned long __user *p = (unsigned long __user *)
+                               libinfo.lib_list[i].start_data;
+                       p -= j + 1;
+                       if (put_user(val, p))
+                               return -EFAULT;
+               }
+       }
  
         install_exec_creds(bprm);
  
         set_binfmt(&flat_format);
  
-       p = ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4;
-       DBG_FLT("p=%x\n", (int)p);
+#ifdef CONFIG_MMU
+       res = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
+       if (!res)
+               res = create_flat_tables(bprm, bprm->p);
+#else
+       /* Stash our initial stack pointer into the mm structure */
+       current->mm->start_stack =
+               ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4;
+       pr_debug("sp=%lx\n", current->mm->start_stack);
  
-       /* copy the arg pages onto the stack, this could be more efficient :-) */
-       for (i = TOP_OF_ARGS - 1; i >= bprm->p; i--)
-               * (char *) --p =
-                       ((char *) page_address(bprm->page[i/PAGE_SIZE]))[i % PAGE_SIZE];
+       /* copy the arg pages onto the stack */
+       res = transfer_args_to_stack(bprm, &current->mm->start_stack);
+       if (!res)
+               res = create_flat_tables(bprm, current->mm->start_stack);
+#endif
+       if (res)
+               return res;
  
-       sp = (unsigned long *) create_flat_tables(p, bprm);
-       
         /* Fake some return addresses to ensure the call chain will
          * initialise library in order for us.  We are required to call
          * lib 1 first, then 2, ... and finally the main program (id 0).
@@ -915,24 +981,24 @@ static int load_flat_binary(struct linux_binprm * bprm)
         start_addr = libinfo.lib_list[0].entry;
  
  #ifdef CONFIG_BINFMT_SHARED_FLAT
-       for (i = MAX_SHARED_LIBS-1; i>0; i--) {
+       for (i = MAX_SHARED_LIBS-1; i > 0; i--) {
                 if (libinfo.lib_list[i].loaded) {
                         /* Push previos first to call address */
-                       --sp;   put_user(start_addr, sp);
+                       unsigned long __user *sp;
+                       current->mm->start_stack -= sizeof(unsigned long);
+                       sp = (unsigned long __user *)current->mm->start_stack;
+                       __put_user(start_addr, sp);
                         start_addr = libinfo.lib_list[i].entry;
                 }
         }
  #endif
-       
-       /* Stash our initial stack pointer into the mm structure */
-       current->mm->start_stack = (unsigned long )sp;
  
  #ifdef FLAT_PLAT_INIT
         FLAT_PLAT_INIT(regs);
  #endif
-       DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n",
-               (int)regs, (int)start_addr, (int)current->mm->start_stack);
-       
+
+       pr_debug("start_thread(regs=0x%p, entry=0x%lx, start_stack=0x%lx)\n",
+                regs, start_addr, current->mm->start_stack);
         start_thread(regs, start_addr, current->mm->start_stack);
  
         return 0;
@@ -945,9 +1011,6 @@ static int __init init_flat_binfmt(void)
         register_binfmt(&flat_format);
         return 0;
  }
-
-/****************************************************************************/
-
  core_initcall(init_flat_binfmt);
  
  /****************************************************************************/
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c

index 67a607709d4f7eb18d941cf1a70c7d00a7afba66..53bb7af4e5f06cfb1b0a38e167b7cc880fd8aefe 100644 (file)
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -55,8 +55,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
         }
         if (size > 0) {
                 acl = posix_acl_from_xattr(&init_user_ns, value, size);
-       } else if (size == -ENOENT || size == -ENODATA || size == 0) {
-               /* FIXME, who returns -ENOENT?  I think nobody */
+       } else if (size == -ERANGE || size == -ENODATA || size == 0) {
                 acl = NULL;
         } else {
                 acl = ERR_PTR(-EIO);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c

index 5fb60ea7eee2b2c28e067d11b54c2fad7127c7d8..e0f071f6b5a761faa6a00b741017a759c91e100a 100644 (file)
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -34,6 +34,10 @@
  
  struct __btrfs_workqueue {
         struct workqueue_struct *normal_wq;
+
+       /* File system this workqueue services */
+       struct btrfs_fs_info *fs_info;
+
         /* List head pointing to ordered work list */
         struct list_head ordered_list;
  
@@ -70,6 +74,18 @@ void btrfs_##name(struct work_struct *arg)                           \
         normal_work_helper(work);                                       \
  }
  
+struct btrfs_fs_info *
+btrfs_workqueue_owner(struct __btrfs_workqueue *wq)
+{
+       return wq->fs_info;
+}
+
+struct btrfs_fs_info *
+btrfs_work_owner(struct btrfs_work *work)
+{
+       return work->wq->fs_info;
+}
+
  BTRFS_WORK_HELPER(worker_helper);
  BTRFS_WORK_HELPER(delalloc_helper);
  BTRFS_WORK_HELPER(flush_delalloc_helper);
@@ -94,14 +110,15 @@ BTRFS_WORK_HELPER(scrubnc_helper);
  BTRFS_WORK_HELPER(scrubparity_helper);
  
  static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
-                        int thresh)
+__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
+                       unsigned int flags, int limit_active, int thresh)
  {
         struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
  
         if (!ret)
                 return NULL;
  
+       ret->fs_info = fs_info;
         ret->limit_active = limit_active;
         atomic_set(&ret->pending, 0);
         if (thresh == 0)
@@ -143,7 +160,8 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
  static inline void
  __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
  
-struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+                                             const char *name,
                                               unsigned int flags,
                                               int limit_active,
                                               int thresh)
@@ -153,7 +171,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
         if (!ret)
                 return NULL;
  
-       ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
+       ret->normal = __btrfs_alloc_workqueue(fs_info, name,
+                                             flags & ~WQ_HIGHPRI,
                                               limit_active, thresh);
         if (!ret->normal) {
                 kfree(ret);
@@ -161,8 +180,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
         }
  
         if (flags & WQ_HIGHPRI) {
-               ret->high = __btrfs_alloc_workqueue(name, flags, limit_active,
-                                                   thresh);
+               ret->high = __btrfs_alloc_workqueue(fs_info, name, flags,
+                                                   limit_active, thresh);
                 if (!ret->high) {
                         __btrfs_destroy_workqueue(ret->normal);
                         kfree(ret);
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h

index ad4d0647d1a6c03b6b3ba9d1b4aae9a1ceff6df5..8e52484cd4615544a9d25687ba33f36a56c29d52 100644 (file)
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -21,6 +21,7 @@
  #define __BTRFS_ASYNC_THREAD_
  #include <linux/workqueue.h>
  
+struct btrfs_fs_info;
  struct btrfs_workqueue;
  /* Internal use only */
  struct __btrfs_workqueue;
@@ -67,7 +68,8 @@ BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
  BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
  
  
-struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+                                             const char *name,
                                               unsigned int flags,
                                               int limit_active,
                                               int thresh);
@@ -80,4 +82,6 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
  void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
  void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
  void btrfs_set_work_high_priority(struct btrfs_work *work);
+struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work);
+struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq);
  #endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c

index 8bb3509099e8fd802f871ecde0505ab5bd173005..2b88439c2ee864ffcec7a1e43f6e620a729bdbff 100644 (file)
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -139,7 +139,7 @@ int __init btrfs_prelim_ref_init(void)
         btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
                                         sizeof(struct __prelim_ref),
                                         0,
-                                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                                       SLAB_MEM_SPREAD,
                                         NULL);
         if (!btrfs_prelim_ref_cache)
                 return -ENOMEM;
@@ -361,7 +361,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
                 goto out;
         }
  
-       if (btrfs_test_is_dummy_root(root)) {
+       if (btrfs_is_testing(fs_info)) {
                 srcu_read_unlock(&fs_info->subvol_srcu, index);
                 ret = -ENOENT;
                 goto out;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c

index cefedabf0a92fd6aaa4c4986d4899fc9cac3b859..029db6e1105c7eb2bd42a4abc06a9f910bf9e346 100644 (file)
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -403,7 +403,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                         }
  
                         ret = btrfs_map_bio(root, bio, 0, 1);
-                       BUG_ON(ret); /* -ENOMEM */
+                       if (ret) {
+                               bio->bi_error = ret;
+                               bio_endio(bio);
+                       }
  
                         bio_put(bio);
  
@@ -434,7 +437,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
         }
  
         ret = btrfs_map_bio(root, bio, 0, 1);
-       BUG_ON(ret); /* -ENOMEM */
+       if (ret) {
+               bio->bi_error = ret;
+               bio_endio(bio);
+       }
  
         bio_put(bio);
         return 0;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c

index a85cf7d2330981d606352cc8218167570db5507a..d1c56c94dd5abe2c4d007d55936699629009b604 100644 (file)
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1153,14 +1153,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
  
         ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 return ret;
         }
  
         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
                 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
                 if (ret) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         return ret;
                 }
         }
@@ -1198,7 +1198,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                 if (last_ref) {
                         ret = tree_mod_log_free_eb(root->fs_info, buf);
                         if (ret) {
-                               btrfs_abort_transaction(trans, root, ret);
+                               btrfs_abort_transaction(trans, ret);
                                 return ret;
                         }
                 }
@@ -1505,7 +1505,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
                                    struct extent_buffer *buf)
  {
-       if (btrfs_test_is_dummy_root(root))
+       if (btrfs_is_testing(root->fs_info))
                 return 0;
  
         /* ensure we can see the force_cow */
@@ -1771,6 +1771,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
         unsigned long map_len = 0;
         int err;
  
+       if (low > high) {
+               btrfs_err(eb->fs_info,
+                "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
+                         __func__, low, high, eb->start,
+                         btrfs_header_owner(eb), btrfs_header_level(eb));
+               return -EINVAL;
+       }
+
         while (low < high) {
                 mid = (low + high) / 2;
                 offset = p + mid * item_size;
@@ -1858,7 +1866,6 @@ static void root_sub_used(struct btrfs_root *root, u32 size)
  
  /* given a node and slot number, this reads the blocks it points to.  The
   * extent buffer is returned with a reference taken (but unlocked).
- * NULL is returned on error.
   */
  static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
                                    struct extent_buffer *parent, int slot)
@@ -1866,19 +1873,16 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
         int level = btrfs_header_level(parent);
         struct extent_buffer *eb;
  
-       if (slot < 0)
-               return NULL;
-       if (slot >= btrfs_header_nritems(parent))
-               return NULL;
+       if (slot < 0 || slot >= btrfs_header_nritems(parent))
+               return ERR_PTR(-ENOENT);
  
         BUG_ON(level == 0);
  
         eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
                              btrfs_node_ptr_generation(parent, slot));
-       if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) {
-               if (!IS_ERR(eb))
-                       free_extent_buffer(eb);
-               eb = NULL;
+       if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
+               free_extent_buffer(eb);
+               eb = ERR_PTR(-EIO);
         }
  
         return eb;
@@ -1931,8 +1935,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
  
                 /* promote the child to a root */
                 child = read_node_slot(root, mid, 0);
-               if (!child) {
-                       ret = -EROFS;
+               if (IS_ERR(child)) {
+                       ret = PTR_ERR(child);
                         btrfs_handle_fs_error(root->fs_info, ret, NULL);
                         goto enospc;
                 }
@@ -1970,6 +1974,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                 return 0;
  
         left = read_node_slot(root, parent, pslot - 1);
+       if (IS_ERR(left))
+               left = NULL;
+
         if (left) {
                 btrfs_tree_lock(left);
                 btrfs_set_lock_blocking(left);
@@ -1980,7 +1987,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                         goto enospc;
                 }
         }
+
         right = read_node_slot(root, parent, pslot + 1);
+       if (IS_ERR(right))
+               right = NULL;
+
         if (right) {
                 btrfs_tree_lock(right);
                 btrfs_set_lock_blocking(right);
@@ -2135,6 +2146,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                 return 1;
  
         left = read_node_slot(root, parent, pslot - 1);
+       if (IS_ERR(left))
+               left = NULL;
  
         /* first, try to make some room in the middle buffer */
         if (left) {
@@ -2185,6 +2198,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                 free_extent_buffer(left);
         }
         right = read_node_slot(root, parent, pslot + 1);
+       if (IS_ERR(right))
+               right = NULL;
  
         /*
          * then try to empty the right most buffer into the middle
@@ -3240,7 +3255,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
         ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
                                    push_items);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 return ret;
         }
         copy_extent_buffer(dst, src,
@@ -3315,7 +3330,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
         ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
                                    src_nritems - push_items, push_items);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 return ret;
         }
         copy_extent_buffer(dst, src,
@@ -3519,7 +3534,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
         ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
                                    mid, c_nritems - mid);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 return ret;
         }
         copy_extent_buffer(split, c,
@@ -3773,7 +3788,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
         btrfs_assert_tree_locked(path->nodes[1]);
  
         right = read_node_slot(root, upper, slot + 1);
-       if (right == NULL)
+       /*
+        * slot + 1 is not valid or we fail to read the right node,
+        * no big deal, just return.
+        */
+       if (IS_ERR(right))
                 return 1;
  
         btrfs_tree_lock(right);
@@ -4003,7 +4022,11 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
         btrfs_assert_tree_locked(path->nodes[1]);
  
         left = read_node_slot(root, path->nodes[1], slot - 1);
-       if (left == NULL)
+       /*
+        * slot - 1 is not valid or we fail to read the left node,
+        * no big deal, just return.
+        */
+       if (IS_ERR(left))
                 return 1;
  
         btrfs_tree_lock(left);
@@ -5210,7 +5233,10 @@ find_next_key:
                 }
                 btrfs_set_path_blocking(path);
                 cur = read_node_slot(root, cur, slot);
-               BUG_ON(!cur); /* -ENOMEM */
+               if (IS_ERR(cur)) {
+                       ret = PTR_ERR(cur);
+                       goto out;
+               }
  
                 btrfs_tree_read_lock(cur);
  
@@ -5229,15 +5255,21 @@ out:
         return ret;
  }
  
-static void tree_move_down(struct btrfs_root *root,
+static int tree_move_down(struct btrfs_root *root,
                            struct btrfs_path *path,
                            int *level, int root_level)
  {
+       struct extent_buffer *eb;
+
         BUG_ON(*level == 0);
-       path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
-                                       path->slots[*level]);
+       eb = read_node_slot(root, path->nodes[*level], path->slots[*level]);
+       if (IS_ERR(eb))
+               return PTR_ERR(eb);
+
+       path->nodes[*level - 1] = eb;
         path->slots[*level - 1] = 0;
         (*level)--;
+       return 0;
  }
  
  static int tree_move_next_or_upnext(struct btrfs_root *root,
@@ -5282,8 +5314,7 @@ static int tree_advance(struct btrfs_root *root,
         if (*level == 0 || !allow_down) {
                 ret = tree_move_next_or_upnext(root, path, level, root_level);
         } else {
-               tree_move_down(root, path, level, root_level);
-               ret = 0;
+               ret = tree_move_down(root, path, level, root_level);
         }
         if (ret >= 0) {
                 if (*level == 0)
@@ -5457,8 +5488,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
                                         left_root_level,
                                         advance_left != ADVANCE_ONLY_NEXT,
                                         &left_key);
-                       if (ret < 0)
+                       if (ret == -1)
                                 left_end_reached = ADVANCE;
+                       else if (ret < 0)
+                               goto out;
                         advance_left = 0;
                 }
                 if (advance_right && !right_end_reached) {
@@ -5466,8 +5499,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
                                         right_root_level,
                                         advance_right != ADVANCE_ONLY_NEXT,
                                         &right_key);
-                       if (ret < 0)
+                       if (ret == -1)
                                 right_end_reached = ADVANCE;
+                       else if (ret < 0)
+                               goto out;
                         advance_right = 0;
                 }
  
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h

index 443fcc4021141f0ea5f5f9bbe5a1cdca0fb6f4ee..2fe8f89091a3097aa8e55bcee4a20c7b43ab1ca2 100644 (file)
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -117,6 +117,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
  #define BTRFS_FS_STATE_REMOUNTING      1
  #define BTRFS_FS_STATE_TRANS_ABORTED   2
  #define BTRFS_FS_STATE_DEV_REPLACING   3
+#define BTRFS_FS_STATE_DUMMY_FS_INFO   4
  
  #define BTRFS_BACKREF_REV_MAX          256
  #define BTRFS_BACKREF_REV_SHIFT                56
@@ -144,21 +145,6 @@ struct btrfs_header {
         u8 level;
  } __attribute__ ((__packed__));
  
-#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
-                                     sizeof(struct btrfs_header)) / \
-                                    sizeof(struct btrfs_key_ptr))
-#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
-#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize))
-#define BTRFS_FILE_EXTENT_INLINE_DATA_START            \
-               (offsetof(struct btrfs_file_extent_item, disk_bytenr))
-#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
-                                       sizeof(struct btrfs_item) - \
-                                       BTRFS_FILE_EXTENT_INLINE_DATA_START)
-#define BTRFS_MAX_XATTR_SIZE(r)        (BTRFS_LEAF_DATA_SIZE(r) - \
-                                sizeof(struct btrfs_item) -\
-                                sizeof(struct btrfs_dir_item))
-
-
  /*
   * this is a very generous portion of the super block, giving us
   * room to translate 14 chunks with 3 stripes each.
@@ -1114,12 +1100,11 @@ struct btrfs_subvolume_writers {
  #define BTRFS_ROOT_REF_COWS            1
  #define BTRFS_ROOT_TRACK_DIRTY         2
  #define BTRFS_ROOT_IN_RADIX            3
-#define BTRFS_ROOT_DUMMY_ROOT          4
-#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED        5
-#define BTRFS_ROOT_DEFRAG_RUNNING      6
-#define BTRFS_ROOT_FORCE_COW           7
-#define BTRFS_ROOT_MULTI_LOG_TASKS     8
-#define BTRFS_ROOT_DIRTY               9
+#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED        4
+#define BTRFS_ROOT_DEFRAG_RUNNING      5
+#define BTRFS_ROOT_FORCE_COW           6
+#define BTRFS_ROOT_MULTI_LOG_TASKS     7
+#define BTRFS_ROOT_DIRTY               8
  
  /*
   * in ram representation of the tree.  extent_root is used for all allocations
@@ -1181,8 +1166,10 @@ struct btrfs_root {
  
         u64 highest_objectid;
  
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
         /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
         u64 alloc_bytenr;
+#endif
  
         u64 defrag_trans_start;
         struct btrfs_key defrag_progress;
@@ -1259,6 +1246,39 @@ struct btrfs_root {
         atomic_t qgroup_meta_rsv;
  };
  
+static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize)
+{
+       return blocksize - sizeof(struct btrfs_header);
+}
+
+static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_root *root)
+{
+       return __BTRFS_LEAF_DATA_SIZE(root->nodesize);
+}
+
+static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_root *root)
+{
+       return BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+}
+
+static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_root *root)
+{
+       return BTRFS_LEAF_DATA_SIZE(root) / sizeof(struct btrfs_key_ptr);
+}
+
+#define BTRFS_FILE_EXTENT_INLINE_DATA_START            \
+               (offsetof(struct btrfs_file_extent_item, disk_bytenr))
+static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_root *root)
+{
+       return BTRFS_MAX_ITEM_SIZE(root) -
+              BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root)
+{
+       return BTRFS_MAX_ITEM_SIZE(root) - sizeof(struct btrfs_dir_item);
+}
+
  /*
   * Flags for mount options.
   *
@@ -1299,21 +1319,21 @@ struct btrfs_root {
  #define btrfs_clear_opt(o, opt)                ((o) &= ~BTRFS_MOUNT_##opt)
  #define btrfs_set_opt(o, opt)          ((o) |= BTRFS_MOUNT_##opt)
  #define btrfs_raw_test_opt(o, opt)     ((o) & BTRFS_MOUNT_##opt)
-#define btrfs_test_opt(root, opt)      ((root)->fs_info->mount_opt & \
+#define btrfs_test_opt(fs_info, opt)   ((fs_info)->mount_opt & \
                                          BTRFS_MOUNT_##opt)
  
-#define btrfs_set_and_info(root, opt, fmt, args...)                    \
+#define btrfs_set_and_info(fs_info, opt, fmt, args...)                 \
  {                                                                      \
-       if (!btrfs_test_opt(root, opt))                                 \
-               btrfs_info(root->fs_info, fmt, ##args);                 \
-       btrfs_set_opt(root->fs_info->mount_opt, opt);                   \
+       if (!btrfs_test_opt(fs_info, opt))                              \
+               btrfs_info(fs_info, fmt, ##args);                       \
+       btrfs_set_opt(fs_info->mount_opt, opt);                         \
  }
  
-#define btrfs_clear_and_info(root, opt, fmt, args...)                  \
+#define btrfs_clear_and_info(fs_info, opt, fmt, args...)               \
  {                                                                      \
-       if (btrfs_test_opt(root, opt))                                  \
-               btrfs_info(root->fs_info, fmt, ##args);                 \
-       btrfs_clear_opt(root->fs_info->mount_opt, opt);                 \
+       if (btrfs_test_opt(fs_info, opt))                               \
+               btrfs_info(fs_info, fmt, ##args);                       \
+       btrfs_clear_opt(fs_info->mount_opt, opt);                       \
  }
  
  #ifdef CONFIG_BTRFS_DEBUG
@@ -1321,9 +1341,9 @@ static inline int
  btrfs_should_fragment_free_space(struct btrfs_root *root,
                                  struct btrfs_block_group_cache *block_group)
  {
-       return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
+       return (btrfs_test_opt(root->fs_info, FRAGMENT_METADATA) &&
                 block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
-              (btrfs_test_opt(root, FRAGMENT_DATA) &&
+              (btrfs_test_opt(root->fs_info, FRAGMENT_DATA) &&
                 block_group->flags &  BTRFS_BLOCK_GROUP_DATA);
  }
  #endif
@@ -2886,9 +2906,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
  int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
  
  /* root-item.c */
-int btrfs_find_root_ref(struct btrfs_root *tree_root,
-                       struct btrfs_path *path,
-                       u64 root_id, u64 ref_id);
  int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
                        struct btrfs_root *tree_root,
                        u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
@@ -3362,23 +3379,23 @@ const char *btrfs_decode_error(int errno);
  
  __cold
  void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root, const char *function,
+                              const char *function,
                                unsigned int line, int errno);
  
  /*
   * Call btrfs_abort_transaction as early as possible when an error condition is
   * detected, that way the exact line number is reported.
   */
-#define btrfs_abort_transaction(trans, root, errno)            \
+#define btrfs_abort_transaction(trans, errno)          \
  do {                                                           \
         /* Report first abort since mount */                    \
         if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,     \
-                       &((root)->fs_info->fs_state))) {        \
+                       &((trans)->fs_info->fs_state))) {       \
                 WARN(1, KERN_DEBUG                              \
                 "BTRFS: Transaction aborted (error %d)\n",      \
                 (errno));                                       \
         }                                                       \
-       __btrfs_abort_transaction((trans), (root), __func__,    \
+       __btrfs_abort_transaction((trans), __func__,            \
                                   __LINE__, (errno));           \
  } while (0)
  
@@ -3610,13 +3627,13 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
  void btrfs_test_destroy_inode(struct inode *inode);
  #endif
  
-static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
  {
  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-       if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+       if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
+                             &fs_info->fs_state)))
                 return 1;
  #endif
         return 0;
  }
-
  #endif
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h

new file mode 100644 (file)

index 0000000..83ebfe2
--- /dev/null
+++ b/fs/btrfs/dedupe.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) 2016 Fujitsu.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_DEDUPE__
+#define __BTRFS_DEDUPE__
+
+/* later in-band dedupe will expand this struct */
+struct btrfs_dedupe_hash;
+#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c

index dd3c040139a2ffb38957177b383790d2a6782e51..3eeb9cd8cfa57edc81bdeba91b1d37a3011d7bf9 100644 (file)
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -34,7 +34,7 @@ int __init btrfs_delayed_inode_init(void)
         delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
                                         sizeof(struct btrfs_delayed_node),
                                         0,
-                                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                                       SLAB_MEM_SPREAD,
                                         NULL);
         if (!delayed_node_cache)
                 return -ENOMEM;
@@ -1170,7 +1170,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
                 if (ret) {
                         btrfs_release_delayed_node(curr_node);
                         curr_node = NULL;
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         break;
                 }
  
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c

index 430b3689b112b745d531109f64f1dc3ac10bca78..b6d210e7a993fd67634b3523aa3e61a1121d31bd 100644 (file)
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -606,7 +606,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
                 qrecord->num_bytes = num_bytes;
                 qrecord->old_roots = NULL;
  
-               qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs,
+               qexisting = btrfs_qgroup_insert_dirty_extent(fs_info,
+                                                            delayed_refs,
                                                              qrecord);
                 if (qexisting)
                         kfree(qrecord);
@@ -615,7 +616,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
         spin_lock_init(&head_ref->lock);
         mutex_init(&head_ref->mutex);
  
-       trace_add_delayed_ref_head(ref, head_ref, action);
+       trace_add_delayed_ref_head(fs_info, ref, head_ref, action);
  
         existing = htree_insert(&delayed_refs->href_root,
                                 &head_ref->href_node);
@@ -682,7 +683,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
                 ref->type = BTRFS_TREE_BLOCK_REF_KEY;
         full_ref->level = level;
  
-       trace_add_delayed_tree_ref(ref, full_ref, action);
+       trace_add_delayed_tree_ref(fs_info, ref, full_ref, action);
  
         ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
  
@@ -739,7 +740,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
         full_ref->objectid = owner;
         full_ref->offset = offset;
  
-       trace_add_delayed_data_ref(ref, full_ref, action);
+       trace_add_delayed_data_ref(fs_info, ref, full_ref, action);
  
         ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
  
@@ -940,28 +941,28 @@ int btrfs_delayed_ref_init(void)
         btrfs_delayed_ref_head_cachep = kmem_cache_create(
                                 "btrfs_delayed_ref_head",
                                 sizeof(struct btrfs_delayed_ref_head), 0,
-                               SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+                               SLAB_MEM_SPREAD, NULL);
         if (!btrfs_delayed_ref_head_cachep)
                 goto fail;
  
         btrfs_delayed_tree_ref_cachep = kmem_cache_create(
                                 "btrfs_delayed_tree_ref",
                                 sizeof(struct btrfs_delayed_tree_ref), 0,
-                               SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+                               SLAB_MEM_SPREAD, NULL);
         if (!btrfs_delayed_tree_ref_cachep)
                 goto fail;
  
         btrfs_delayed_data_ref_cachep = kmem_cache_create(
                                 "btrfs_delayed_data_ref",
                                 sizeof(struct btrfs_delayed_data_ref), 0,
-                               SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+                               SLAB_MEM_SPREAD, NULL);
         if (!btrfs_delayed_data_ref_cachep)
                 goto fail;
  
         btrfs_delayed_extent_op_cachep = kmem_cache_create(
                                 "btrfs_delayed_extent_op",
                                 sizeof(struct btrfs_delayed_extent_op), 0,
-                               SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+                               SLAB_MEM_SPREAD, NULL);
         if (!btrfs_delayed_extent_op_cachep)
                 goto fail;
  
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c

index 63ef9cdf0144dd00f226d1a98069f58b916f2dc4..e9bbff3c0029c57814207014c5d29c4b725a73b3 100644 (file)
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -142,7 +142,7 @@ no_valid_dev_replace_entry_found:
                  * missing
                  */
                 if (!dev_replace->srcdev &&
-                   !btrfs_test_opt(dev_root, DEGRADED)) {
+                   !btrfs_test_opt(dev_root->fs_info, DEGRADED)) {
                         ret = -EIO;
                         btrfs_warn(fs_info,
                            "cannot mount because device replace operation is ongoing and");
@@ -151,7 +151,7 @@ no_valid_dev_replace_entry_found:
                            src_devid);
                 }
                 if (!dev_replace->tgtdev &&
-                   !btrfs_test_opt(dev_root, DEGRADED)) {
+                   !btrfs_test_opt(dev_root->fs_info, DEGRADED)) {
                         ret = -EIO;
                         btrfs_warn(fs_info,
                            "cannot mount because device replace operation is ongoing and");
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c

index 9a726ded2c6d150e73bd2719a09cb95a8634ab57..87dad552e39ae13ea2533c4a5718a5af635bcbee 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -101,7 +101,7 @@ int __init btrfs_end_io_wq_init(void)
         btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
                                         sizeof(struct btrfs_end_io_wq),
                                         0,
-                                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                                       SLAB_MEM_SPREAD,
                                         NULL);
         if (!btrfs_end_io_wq_cache)
                 return -ENOMEM;
@@ -1140,7 +1140,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
  struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
                                                  u64 bytenr)
  {
-       if (btrfs_test_is_dummy_root(root))
+       if (btrfs_is_testing(root->fs_info))
                 return alloc_test_extent_buffer(root->fs_info, bytenr,
                                 root->nodesize);
         return alloc_extent_buffer(root->fs_info, bytenr);
@@ -1227,6 +1227,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
                          struct btrfs_root *root, struct btrfs_fs_info *fs_info,
                          u64 objectid)
  {
+       bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
         root->node = NULL;
         root->commit_root = NULL;
         root->sectorsize = sectorsize;
@@ -1281,14 +1282,14 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
         root->log_transid = 0;
         root->log_transid_committed = -1;
         root->last_log_commit = 0;
-       if (fs_info)
+       if (!dummy)
                 extent_io_tree_init(&root->dirty_log_pages,
                                      fs_info->btree_inode->i_mapping);
  
         memset(&root->root_key, 0, sizeof(root->root_key));
         memset(&root->root_item, 0, sizeof(root->root_item));
         memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
-       if (fs_info)
+       if (!dummy)
                 root->defrag_trans_start = fs_info->generation;
         else
                 root->defrag_trans_start = 0;
@@ -1309,17 +1310,20 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
  
  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
  /* Should only be used by the testing infrastructure */
-struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize)
+struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info,
+                                         u32 sectorsize, u32 nodesize)
  {
         struct btrfs_root *root;
  
-       root = btrfs_alloc_root(NULL, GFP_KERNEL);
+       if (!fs_info)
+               return ERR_PTR(-EINVAL);
+
+       root = btrfs_alloc_root(fs_info, GFP_KERNEL);
         if (!root)
                 return ERR_PTR(-ENOMEM);
         /* We don't use the stripesize in selftest, set it as sectorsize */
-       __setup_root(nodesize, sectorsize, sectorsize, root, NULL,
+       __setup_root(nodesize, sectorsize, sectorsize, root, fs_info,
                         BTRFS_ROOT_TREE_OBJECTID);
-       set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
         root->alloc_bytenr = 0;
  
         return root;
@@ -1594,14 +1598,14 @@ int btrfs_init_fs_root(struct btrfs_root *root)
  
         ret = get_anon_bdev(&root->anon_dev);
         if (ret)
-               goto free_writers;
+               goto fail;
  
         mutex_lock(&root->objectid_mutex);
         ret = btrfs_find_highest_objectid(root,
                                         &root->highest_objectid);
         if (ret) {
                 mutex_unlock(&root->objectid_mutex);
-               goto free_root_dev;
+               goto fail;
         }
  
         ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
@@ -1609,14 +1613,8 @@ int btrfs_init_fs_root(struct btrfs_root *root)
         mutex_unlock(&root->objectid_mutex);
  
         return 0;
-
-free_root_dev:
-       free_anon_bdev(root->anon_dev);
-free_writers:
-       btrfs_free_subvolume_writers(root->subv_writers);
  fail:
-       kfree(root->free_ino_ctl);
-       kfree(root->free_ino_pinned);
+       /* the caller is responsible for calling free_fs_root */
         return ret;
  }
  
@@ -2310,17 +2308,19 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
         unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
  
         fs_info->workers =
-               btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
-                                     max_active, 16);
+               btrfs_alloc_workqueue(fs_info, "worker",
+                                     flags | WQ_HIGHPRI, max_active, 16);
  
         fs_info->delalloc_workers =
-               btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
+               btrfs_alloc_workqueue(fs_info, "delalloc",
+                                     flags, max_active, 2);
  
         fs_info->flush_workers =
-               btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
+               btrfs_alloc_workqueue(fs_info, "flush_delalloc",
+                                     flags, max_active, 0);
  
         fs_info->caching_workers =
-               btrfs_alloc_workqueue("cache", flags, max_active, 0);
+               btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
  
         /*
          * a higher idle thresh on the submit workers makes it much more
@@ -2328,41 +2328,48 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
          * devices
          */
         fs_info->submit_workers =
-               btrfs_alloc_workqueue("submit", flags,
+               btrfs_alloc_workqueue(fs_info, "submit", flags,
                                       min_t(u64, fs_devices->num_devices,
                                             max_active), 64);
  
         fs_info->fixup_workers =
-               btrfs_alloc_workqueue("fixup", flags, 1, 0);
+               btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
  
         /*
          * endios are largely parallel and should have a very
          * low idle thresh
          */
         fs_info->endio_workers =
-               btrfs_alloc_workqueue("endio", flags, max_active, 4);
+               btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
         fs_info->endio_meta_workers =
-               btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+               btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
+                                     max_active, 4);
         fs_info->endio_meta_write_workers =
-               btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+               btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
+                                     max_active, 2);
         fs_info->endio_raid56_workers =
-               btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+               btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
+                                     max_active, 4);
         fs_info->endio_repair_workers =
-               btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
+               btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0);
         fs_info->rmw_workers =
-               btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+               btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
         fs_info->endio_write_workers =
-               btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+               btrfs_alloc_workqueue(fs_info, "endio-write", flags,
+                                     max_active, 2);
         fs_info->endio_freespace_worker =
-               btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+               btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
+                                     max_active, 0);
         fs_info->delayed_workers =
-               btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+               btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
+                                     max_active, 0);
         fs_info->readahead_workers =
-               btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+               btrfs_alloc_workqueue(fs_info, "readahead", flags,
+                                     max_active, 2);
         fs_info->qgroup_rescan_workers =
-               btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+               btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
         fs_info->extent_workers =
-               btrfs_alloc_workqueue("extent-refs", flags,
+               btrfs_alloc_workqueue(fs_info, "extent-refs", flags,
                                       min_t(u64, fs_devices->num_devices,
                                             max_active), 8);
  
@@ -3010,8 +3017,8 @@ retry_root_backup:
         if (IS_ERR(fs_info->transaction_kthread))
                 goto fail_cleaner;
  
-       if (!btrfs_test_opt(tree_root, SSD) &&
-           !btrfs_test_opt(tree_root, NOSSD) &&
+       if (!btrfs_test_opt(tree_root->fs_info, SSD) &&
+           !btrfs_test_opt(tree_root->fs_info, NOSSD) &&
             !fs_info->fs_devices->rotating) {
                 btrfs_info(fs_info, "detected SSD devices, enabling SSD mode");
                 btrfs_set_opt(fs_info->mount_opt, SSD);
@@ -3024,9 +3031,9 @@ retry_root_backup:
         btrfs_apply_pending_changes(fs_info);
  
  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-       if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+       if (btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY)) {
                 ret = btrfsic_mount(tree_root, fs_devices,
-                                   btrfs_test_opt(tree_root,
+                                   btrfs_test_opt(tree_root->fs_info,
                                         CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
                                     1 : 0,
                                     fs_info->check_integrity_print_mask);
@@ -3042,7 +3049,7 @@ retry_root_backup:
  
         /* do not make disk changes in broken FS or nologreplay is given */
         if (btrfs_super_log_root(disk_super) != 0 &&
-           !btrfs_test_opt(tree_root, NOLOGREPLAY)) {
+           !btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) {
                 ret = btrfs_replay_log(fs_info, fs_devices);
                 if (ret) {
                         err = ret;
@@ -3083,7 +3090,7 @@ retry_root_backup:
         if (sb->s_flags & MS_RDONLY)
                 return 0;
  
-       if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
+       if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) &&
             !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
                 btrfs_info(fs_info, "creating free space tree");
                 ret = btrfs_create_free_space_tree(fs_info);
@@ -3120,7 +3127,7 @@ retry_root_backup:
  
         btrfs_qgroup_rescan_resume(fs_info);
  
-       if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
+       if (btrfs_test_opt(tree_root->fs_info, CLEAR_CACHE) &&
             btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
                 btrfs_info(fs_info, "clearing free space tree");
                 ret = btrfs_clear_free_space_tree(fs_info);
@@ -3141,7 +3148,7 @@ retry_root_backup:
                         close_ctree(tree_root);
                         return ret;
                 }
-       } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
+       } else if (btrfs_test_opt(tree_root->fs_info, RESCAN_UUID_TREE) ||
                    fs_info->generation !=
                                 btrfs_super_uuid_tree_generation(disk_super)) {
                 btrfs_info(fs_info, "checking UUID tree");
@@ -3218,7 +3225,7 @@ fail:
         return err;
  
  recovery_tree_root:
-       if (!btrfs_test_opt(tree_root, USEBACKUPROOT))
+       if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT))
                 goto fail_tree_roots;
  
         free_root_pointers(fs_info, 0);
@@ -3634,7 +3641,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
         int total_errors = 0;
         u64 flags;
  
-       do_barriers = !btrfs_test_opt(root, NOBARRIER);
+       do_barriers = !btrfs_test_opt(root->fs_info, NOBARRIER);
         backup_super_roots(root->fs_info);
  
         sb = root->fs_info->super_for_commit;
@@ -3918,7 +3925,7 @@ void close_ctree(struct btrfs_root *root)
         iput(fs_info->btree_inode);
  
  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-       if (btrfs_test_opt(root, CHECK_INTEGRITY))
+       if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY))
                 btrfsic_unmount(root, fs_info->fs_devices);
  #endif
  
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h

index dbf3e1aab69e904a75a6828bbcb6f955166c8b25..b3207a0e09f7966703e1d130e250f05f610b2a3f 100644 (file)
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -90,7 +90,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
  void btrfs_free_fs_root(struct btrfs_root *root);
  
  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize);
+struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info,
+                                         u32 sectorsize, u32 nodesize);
  #endif
  
  /*
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c

index e9376b1657e25ff18f72d8f41bc37e1d520d3f01..61b494e8e604e9e80f1c46c581ede724f56ea050 100644 (file)
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2180,7 +2180,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                                     path, bytenr, parent, root_objectid,
                                     owner, offset, refs_to_add);
         if (ret)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
  out:
         btrfs_free_path(path);
         return ret;
@@ -2204,7 +2204,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
         ins.type = BTRFS_EXTENT_ITEM_KEY;
  
         ref = btrfs_delayed_node_to_data_ref(node);
-       trace_run_delayed_data_ref(node, ref, node->action);
+       trace_run_delayed_data_ref(root->fs_info, node, ref, node->action);
  
         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
                 parent = ref->parent;
@@ -2359,7 +2359,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
                                                  SKINNY_METADATA);
  
         ref = btrfs_delayed_node_to_tree_ref(node);
-       trace_run_delayed_tree_ref(node, ref, node->action);
+       trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action);
  
         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
                 parent = ref->parent;
@@ -2423,7 +2423,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                  */
                 BUG_ON(extent_op);
                 head = btrfs_delayed_node_to_head(node);
-               trace_run_delayed_ref_head(node, head, node->action);
+               trace_run_delayed_ref_head(root->fs_info, node, head,
+                                          node->action);
  
                 if (insert_reserved) {
                         btrfs_pin_extent(root, node->bytenr,
@@ -2778,7 +2779,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
         u64 num_csums_per_leaf;
         u64 num_csums;
  
-       csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+       csum_size = BTRFS_MAX_ITEM_SIZE(root);
         num_csums_per_leaf = div64_u64(csum_size,
                         (u64)btrfs_super_csum_size(root->fs_info->super_copy));
         num_csums = div64_u64(csum_bytes, root->sectorsize);
@@ -2970,7 +2971,7 @@ again:
         trans->can_flush_pending_bgs = false;
         ret = __btrfs_run_delayed_refs(trans, root, count);
         if (ret < 0) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 return ret;
         }
  
@@ -3234,7 +3235,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                             u64, u64, u64, u64, u64, u64);
  
  
-       if (btrfs_test_is_dummy_root(root))
+       if (btrfs_is_testing(root->fs_info))
                 return 0;
  
         ref_root = btrfs_header_owner(buf);
@@ -3429,7 +3430,7 @@ again:
                  * transaction, this only happens in really bad situations
                  * anyway.
                  */
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_put;
         }
         WARN_ON(ret);
@@ -3447,7 +3448,7 @@ again:
  
         spin_lock(&block_group->lock);
         if (block_group->cached != BTRFS_CACHE_FINISHED ||
-           !btrfs_test_opt(root, SPACE_CACHE)) {
+           !btrfs_test_opt(root->fs_info, SPACE_CACHE)) {
                 /*
                  * don't bother trying to write stuff out _if_
                  * a) we're not cached,
@@ -3524,7 +3525,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
         struct btrfs_path *path;
  
         if (list_empty(&cur_trans->dirty_bgs) ||
-           !btrfs_test_opt(root, SPACE_CACHE))
+           !btrfs_test_opt(root->fs_info, SPACE_CACHE))
                 return 0;
  
         path = btrfs_alloc_path();
@@ -3669,7 +3670,7 @@ again:
                                 }
                                 spin_unlock(&cur_trans->dirty_bgs_lock);
                         } else if (ret) {
-                               btrfs_abort_transaction(trans, root, ret);
+                               btrfs_abort_transaction(trans, ret);
                         }
                 }
  
@@ -3815,7 +3816,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                                             cache);
                         }
                         if (ret)
-                               btrfs_abort_transaction(trans, root, ret);
+                               btrfs_abort_transaction(trans, ret);
                 }
  
                 /* if its not on the io list, we need to put the block group */
@@ -4443,7 +4444,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
         thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
                 btrfs_calc_trans_metadata_size(root, 1);
  
-       if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
+       if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
                 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
                         left, thresh, type);
                 dump_space_info(info, 0, 0);
@@ -4588,7 +4589,7 @@ out:
          */
         if (trans->can_flush_pending_bgs &&
             trans->chunk_bytes_reserved >= (u64)SZ_2M) {
-               btrfs_create_pending_block_groups(trans, trans->root);
+               btrfs_create_pending_block_groups(trans, extent_root);
                 btrfs_trans_release_chunk_metadata(trans);
         }
         return ret;
@@ -5729,7 +5730,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
   */
  void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
  {
-       struct btrfs_fs_info *fs_info = trans->root->fs_info;
+       struct btrfs_fs_info *fs_info = trans->fs_info;
  
         if (!trans->chunk_bytes_reserved)
                 return;
@@ -6100,7 +6101,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
         if (dropped > 0)
                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
  
-       if (btrfs_test_is_dummy_root(root))
+       if (btrfs_is_testing(root->fs_info))
                 return;
  
         trace_btrfs_space_reservation(root->fs_info, "delalloc",
@@ -6215,7 +6216,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                 spin_lock(&cache->space_info->lock);
                 spin_lock(&cache->lock);
  
-               if (btrfs_test_opt(root, SPACE_CACHE) &&
+               if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
                     cache->disk_cache_state < BTRFS_DC_CLEAR)
                         cache->disk_cache_state = BTRFS_DC_CLEAR;
  
@@ -6597,7 +6598,7 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
                    u64 *empty_cluster)
  {
         struct btrfs_free_cluster *ret = NULL;
-       bool ssd = btrfs_test_opt(root, SSD);
+       bool ssd = btrfs_test_opt(root->fs_info, SSD);
  
         *empty_cluster = 0;
         if (btrfs_mixed_space_info(space_info))
@@ -6742,7 +6743,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                         break;
                 }
  
-               if (btrfs_test_opt(root, DISCARD))
+               if (btrfs_test_opt(root->fs_info, DISCARD))
                         ret = btrfs_discard_extent(root, start,
                                                    end + 1 - start, NULL);
  
@@ -6880,7 +6881,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                                     NULL, refs_to_drop,
                                                     is_data, &last_ref);
                         if (ret) {
-                               btrfs_abort_transaction(trans, extent_root, ret);
+                               btrfs_abort_transaction(trans, ret);
                                 goto out;
                         }
                         btrfs_release_path(path);
@@ -6929,7 +6930,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                                          path->nodes[0]);
                         }
                         if (ret < 0) {
-                               btrfs_abort_transaction(trans, extent_root, ret);
+                               btrfs_abort_transaction(trans, ret);
                                 goto out;
                         }
                         extent_slot = path->slots[0];
@@ -6940,10 +6941,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
                         bytenr, parent, root_objectid, owner_objectid,
                         owner_offset);
-               btrfs_abort_transaction(trans, extent_root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out;
         } else {
-               btrfs_abort_transaction(trans, extent_root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out;
         }
  
@@ -6955,7 +6956,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                 ret = convert_extent_item_v0(trans, extent_root, path,
                                              owner_objectid, 0);
                 if (ret < 0) {
-                       btrfs_abort_transaction(trans, extent_root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
  
@@ -6974,7 +6975,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                         btrfs_print_leaf(extent_root, path->nodes[0]);
                 }
                 if (ret < 0) {
-                       btrfs_abort_transaction(trans, extent_root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
  
@@ -6999,7 +7000,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                 btrfs_err(info, "trying to drop %d refs but we only have %Lu "
                           "for bytenr %Lu", refs_to_drop, refs, bytenr);
                 ret = -EINVAL;
-               btrfs_abort_transaction(trans, extent_root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out;
         }
         refs -= refs_to_drop;
@@ -7022,7 +7023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                                     iref, refs_to_drop,
                                                     is_data, &last_ref);
                         if (ret) {
-                               btrfs_abort_transaction(trans, extent_root, ret);
+                               btrfs_abort_transaction(trans, ret);
                                 goto out;
                         }
                 }
@@ -7045,7 +7046,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                       num_to_del);
                 if (ret) {
-                       btrfs_abort_transaction(trans, extent_root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
                 btrfs_release_path(path);
@@ -7053,7 +7054,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                 if (is_data) {
                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
                         if (ret) {
-                               btrfs_abort_transaction(trans, extent_root, ret);
+                               btrfs_abort_transaction(trans, ret);
                                 goto out;
                         }
                 }
@@ -7061,13 +7062,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                 ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
                                              num_bytes);
                 if (ret) {
-                       btrfs_abort_transaction(trans, extent_root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
  
                 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
                 if (ret) {
-                       btrfs_abort_transaction(trans, extent_root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
         }
@@ -7216,7 +7217,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
         int ret;
         struct btrfs_fs_info *fs_info = root->fs_info;
  
-       if (btrfs_test_is_dummy_root(root))
+       if (btrfs_is_testing(fs_info))
                 return 0;
  
         add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
@@ -7851,8 +7852,7 @@ loop:
                          * can do more things.
                          */
                         if (ret < 0 && ret != -ENOSPC)
-                               btrfs_abort_transaction(trans,
-                                                       root, ret);
+                               btrfs_abort_transaction(trans, ret);
                         else
                                 ret = 0;
                         if (!exist)
@@ -7906,8 +7906,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
         printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
                info->flags,
                info->total_bytes - info->bytes_used - info->bytes_pinned -
-              info->bytes_reserved - info->bytes_readonly,
-              (info->full) ? "" : "not ");
+              info->bytes_reserved - info->bytes_readonly -
+              info->bytes_may_use, (info->full) ? "" : "not ");
         printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
                "reserved=%llu, may_use=%llu, readonly=%llu\n",
                info->total_bytes, info->bytes_used, info->bytes_pinned,
@@ -7961,7 +7961,7 @@ again:
                         if (num_bytes == min_alloc_size)
                                 final_tried = true;
                         goto again;
-               } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+               } else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
                         struct btrfs_space_info *sinfo;
  
                         sinfo = __find_space_info(root->fs_info, flags);
@@ -7992,7 +7992,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
         if (pin)
                 pin_down_extent(root, cache, start, len, 1);
         else {
-               if (btrfs_test_opt(root, DISCARD))
+               if (btrfs_test_opt(root->fs_info, DISCARD))
                         ret = btrfs_discard_extent(root, start, len, NULL);
                 btrfs_add_free_space(cache, start, len);
                 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
@@ -8300,7 +8300,7 @@ again:
                 goto again;
         }
  
-       if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+       if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
                 static DEFINE_RATELIMIT_STATE(_rs,
                                 DEFAULT_RATELIMIT_INTERVAL * 10,
                                 /*DEFAULT_RATELIMIT_BURST*/ 1);
@@ -8354,13 +8354,15 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
                                                  SKINNY_METADATA);
  
-       if (btrfs_test_is_dummy_root(root)) {
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+       if (btrfs_is_testing(root->fs_info)) {
                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
                                             level);
                 if (!IS_ERR(buf))
                         root->alloc_bytenr += blocksize;
                 return buf;
         }
+#endif
  
         block_rsv = use_block_rsv(trans, root, blocksize);
         if (IS_ERR(block_rsv))
@@ -8540,7 +8542,8 @@ static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
  
         delayed_refs = &trans->transaction->delayed_refs;
         spin_lock(&delayed_refs->lock);
-       if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+       if (btrfs_qgroup_insert_dirty_extent(trans->fs_info,
+                                            delayed_refs, qrecord))
                 kfree(qrecord);
         spin_unlock(&delayed_refs->lock);
  
@@ -9325,7 +9328,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                                                 &root->root_key,
                                                 root_item);
                         if (ret) {
-                               btrfs_abort_transaction(trans, tree_root, ret);
+                               btrfs_abort_transaction(trans, ret);
                                 err = ret;
                                 goto out_end_trans;
                         }
@@ -9352,7 +9355,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
  
         ret = btrfs_del_root(trans, tree_root, &root->root_key);
         if (ret) {
-               btrfs_abort_transaction(trans, tree_root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_end_trans;
         }
  
@@ -9360,7 +9363,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                 ret = btrfs_find_root(tree_root, &root->root_key, path,
                                       NULL, NULL);
                 if (ret < 0) {
-                       btrfs_abort_transaction(trans, tree_root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         err = ret;
                         goto out_end_trans;
                 } else if (ret > 0) {
@@ -9731,7 +9734,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
         int full = 0;
         int ret = 0;
  
-       debug = btrfs_test_opt(root, ENOSPC_DEBUG);
+       debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG);
  
         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
  
@@ -9887,7 +9890,22 @@ static int find_first_block_group(struct btrfs_root *root,
  
                 if (found_key.objectid >= key->objectid &&
                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
-                       ret = 0;
+                       struct extent_map_tree *em_tree;
+                       struct extent_map *em;
+
+                       em_tree = &root->fs_info->mapping_tree.map_tree;
+                       read_lock(&em_tree->lock);
+                       em = lookup_extent_mapping(em_tree, found_key.objectid,
+                                                  found_key.offset);
+                       read_unlock(&em_tree->lock);
+                       if (!em) {
+                               btrfs_err(root->fs_info,
+                       "logical %llu len %llu found bg but no related chunk",
+                                         found_key.objectid, found_key.offset);
+                               ret = -ENOENT;
+                       } else {
+                               ret = 0;
+                       }
                         goto out;
                 }
                 path->slots[0]++;
@@ -10129,10 +10147,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
         path->reada = READA_FORWARD;
  
         cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
-       if (btrfs_test_opt(root, SPACE_CACHE) &&
+       if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
             btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
                 need_clear = 1;
-       if (btrfs_test_opt(root, CLEAR_CACHE))
+       if (btrfs_test_opt(root->fs_info, CLEAR_CACHE))
                 need_clear = 1;
  
         while (1) {
@@ -10163,7 +10181,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                          * b) Setting 'dirty flag' makes sure that we flush
                          *    the new space cache info onto disk.
                          */
-                       if (btrfs_test_opt(root, SPACE_CACHE))
+                       if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
                                 cache->disk_cache_state = BTRFS_DC_CLEAR;
                 }
  
@@ -10305,11 +10323,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
                                         sizeof(item));
                 if (ret)
-                       btrfs_abort_transaction(trans, extent_root, ret);
+                       btrfs_abort_transaction(trans, ret);
                 ret = btrfs_finish_chunk_alloc(trans, extent_root,
                                                key.objectid, key.offset);
                 if (ret)
-                       btrfs_abort_transaction(trans, extent_root, ret);
+                       btrfs_abort_transaction(trans, ret);
                 add_block_group_free_space(trans, root->fs_info, block_group);
                 /* already aborted the transaction if it failed. */
  next:
@@ -10622,7 +10640,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         spin_lock(&block_group->space_info->lock);
         list_del_init(&block_group->ro_list);
  
-       if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+       if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
                 WARN_ON(block_group->space_info->total_bytes
                         < block_group->key.offset);
                 WARN_ON(block_group->space_info->bytes_readonly
@@ -10890,7 +10908,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 spin_unlock(&space_info->lock);
  
                 /* DISCARD can flip during remount */
-               trimming = btrfs_test_opt(root, DISCARD);
+               trimming = btrfs_test_opt(root->fs_info, DISCARD);
  
                 /* Implicit trim during transaction commit. */
                 if (trimming)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c

index 881eb4667051cde17faf621907d5f28a397e7006..44fe66b53c8b450b4d615e06fc5e8c159bfc9885 100644 (file)
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -163,13 +163,13 @@ int __init extent_io_init(void)
  {
         extent_state_cache = kmem_cache_create("btrfs_extent_state",
                         sizeof(struct extent_state), 0,
-                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+                       SLAB_MEM_SPREAD, NULL);
         if (!extent_state_cache)
                 return -ENOMEM;
  
         extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
                         sizeof(struct extent_buffer), 0,
-                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+                       SLAB_MEM_SPREAD, NULL);
         if (!extent_buffer_cache)
                 goto free_state_cache;
  
@@ -2750,7 +2750,6 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page,
         if (tree->ops && tree->ops->merge_bio_hook)
                 ret = tree->ops->merge_bio_hook(page, offset, size, bio,
                                                 bio_flags);
-       BUG_ON(ret < 0);
         return ret;
  
  }
@@ -2873,6 +2872,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
   * into the tree that are removed when the IO is done (by the end_io
   * handlers)
   * XXX JDM: This needs looking at to ensure proper page locking
+ * return 0 on success, otherwise return error
   */
  static int __do_readpage(struct extent_io_tree *tree,
                          struct page *page,
@@ -2894,7 +2894,7 @@ static int __do_readpage(struct extent_io_tree *tree,
         sector_t sector;
         struct extent_map *em;
         struct block_device *bdev;
-       int ret;
+       int ret = 0;
         int nr = 0;
         size_t pg_offset = 0;
         size_t iosize;
@@ -3075,6 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree,
                 } else {
                         SetPageError(page);
                         unlock_extent(tree, cur, cur + iosize - 1);
+                       goto out;
                 }
                 cur = cur + iosize;
                 pg_offset += iosize;
@@ -3085,7 +3086,7 @@ out:
                         SetPageUptodate(page);
                 unlock_page(page);
         }
-       return 0;
+       return ret;
  }
  
  static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
@@ -5224,14 +5225,31 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
         atomic_set(&eb->io_pages, num_reads);
         for (i = start_i; i < num_pages; i++) {
                 page = eb->pages[i];
+
                 if (!PageUptodate(page)) {
+                       if (ret) {
+                               atomic_dec(&eb->io_pages);
+                               unlock_page(page);
+                               continue;
+                       }
+
                         ClearPageError(page);
                         err = __extent_read_full_page(tree, page,
                                                       get_extent, &bio,
                                                       mirror_num, &bio_flags,
                                                       REQ_META);
-                       if (err)
+                       if (err) {
                                 ret = err;
+                               /*
+                                * We use &bio in above __extent_read_full_page,
+                                * so we ensure that if it returns error, the
+                                * current page fails to add itself to bio and
+                                * it's been unlocked.
+                                *
+                                * We must dec io_pages by ourselves.
+                                */
+                               atomic_dec(&eb->io_pages);
+                       }
                 } else {
                         unlock_page(page);
                 }
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c

index e0715fcfb11ee9ee3582510d037fa32a887616ea..26f9ac719d20b4bff1a6b0a456ca45dd1752b4c7 100644 (file)
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -13,7 +13,7 @@ int __init extent_map_init(void)
  {
         extent_map_cache = kmem_cache_create("btrfs_extent_map",
                         sizeof(struct extent_map), 0,
-                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+                       SLAB_MEM_SPREAD, NULL);
         if (!extent_map_cache)
                 return -ENOMEM;
         return 0;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c

index 62a81ee13a5f649f3d17139f8069395dbd512527..d0d571c47d33b8e876aa1906097678350a2ef74b 100644 (file)
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -250,7 +250,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
                                                 offset + root->sectorsize - 1,
                                                 EXTENT_NODATASUM);
                                 } else {
-                                       btrfs_info(BTRFS_I(inode)->root->fs_info,
+                                       btrfs_info_rl(BTRFS_I(inode)->root->fs_info,
                                                    "no csum found for inode %llu start %llu",
                                                btrfs_ino(inode), offset);
                                 }
@@ -699,7 +699,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
                          */
                         ret = btrfs_split_item(trans, root, path, &key, offset);
                         if (ret && ret != -EAGAIN) {
-                               btrfs_abort_transaction(trans, root, ret);
+                               btrfs_abort_transaction(trans, ret);
                                 goto out;
                         }
  
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c

index bcfb4a27ddd4f82bda72bb789ba1fbdf5804f3a3..9404121fd5f7b44f165c6f76c856548cf5722aff 100644 (file)
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -132,7 +132,7 @@ static int __btrfs_add_inode_defrag(struct inode *inode,
  
  static inline int __need_auto_defrag(struct btrfs_root *root)
  {
-       if (!btrfs_test_opt(root, AUTO_DEFRAG))
+       if (!btrfs_test_opt(root->fs_info, AUTO_DEFRAG))
                 return 0;
  
         if (btrfs_fs_closing(root->fs_info))
@@ -950,7 +950,7 @@ delete_extent_item:
                         ret = btrfs_del_items(trans, root, path, del_slot,
                                               del_nr);
                         if (ret) {
-                               btrfs_abort_transaction(trans, root, ret);
+                               btrfs_abort_transaction(trans, ret);
                                 break;
                         }
  
@@ -974,7 +974,7 @@ delete_extent_item:
                 path->slots[0] = del_slot;
                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
                 if (ret)
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
         }
  
         leaf = path->nodes[0];
@@ -1190,7 +1190,7 @@ again:
                         goto again;
                 }
                 if (ret < 0) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
  
@@ -1278,7 +1278,7 @@ again:
  
                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
                 if (ret < 0) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
         }
@@ -2975,7 +2975,7 @@ int btrfs_auto_defrag_init(void)
  {
         btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
                                         sizeof(struct inode_defrag), 0,
-                                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                                       SLAB_MEM_SPREAD,
                                         NULL);
         if (!btrfs_inode_defrag_cachep)
                 return -ENOMEM;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c

index 69d270f6602c2de22a734af3f848ad03bf767906..d571bd2b697bf56ccb3f838a55002526ab9a8122 100644 (file)
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -280,7 +280,7 @@ fail:
         if (locked)
                 mutex_unlock(&trans->transaction->cache_write_mutex);
         if (ret)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
  
         return ret;
  }
@@ -3026,7 +3026,7 @@ int btrfs_find_space_cluster(struct btrfs_root *root,
          * For metadata, allow allocates with smaller extents.  For
          * data, keep it dense.
          */
-       if (btrfs_test_opt(root, SSD_SPREAD)) {
+       if (btrfs_test_opt(root->fs_info, SSD_SPREAD)) {
                 cont1_bytes = min_bytes = bytes + empty_size;
         } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
                 cont1_bytes = bytes;
@@ -3470,7 +3470,7 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
         int ret = 0;
         u64 root_gen = btrfs_root_generation(&root->root_item);
  
-       if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+       if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
                 return 0;
  
         /*
@@ -3514,7 +3514,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
         struct btrfs_io_ctl io_ctl;
         bool release_metadata = true;
  
-       if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+       if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
                 return 0;
  
         memset(&io_ctl, 0, sizeof(io_ctl));
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c

index 53dbeaf6ce941cc7a6a8bf06c6a81df354e9e364..87e7e3d3e6760ebb2b07ef73162ff1d83942d641 100644 (file)
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -305,7 +305,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
  out:
         kvfree(bitmap);
         if (ret)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
         return ret;
  }
  
@@ -454,7 +454,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
  out:
         kvfree(bitmap);
         if (ret)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
         return ret;
  }
  
@@ -851,7 +851,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
  out:
         btrfs_free_path(path);
         if (ret)
-               btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+               btrfs_abort_transaction(trans, ret);
         return ret;
  }
  
@@ -1047,7 +1047,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans,
  out:
         btrfs_free_path(path);
         if (ret)
-               btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+               btrfs_abort_transaction(trans, ret);
         return ret;
  }
  
@@ -1193,7 +1193,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
  
  abort:
         fs_info->creating_free_space_tree = 0;
-       btrfs_abort_transaction(trans, tree_root, ret);
+       btrfs_abort_transaction(trans, ret);
         btrfs_end_transaction(trans, tree_root);
         return ret;
  }
@@ -1280,7 +1280,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
         return 0;
  
  abort:
-       btrfs_abort_transaction(trans, tree_root, ret);
+       btrfs_abort_transaction(trans, ret);
         btrfs_end_transaction(trans, tree_root);
         return ret;
  }
@@ -1333,7 +1333,7 @@ out:
         btrfs_free_path(path);
         mutex_unlock(&block_group->free_space_lock);
         if (ret)
-               btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+               btrfs_abort_transaction(trans, ret);
         return ret;
  }
  
@@ -1410,7 +1410,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
  out:
         btrfs_free_path(path);
         if (ret)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
         return ret;
  }
  
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c

index 70107f7c930730c8054d45c153dec71cf9227270..aa6fabaee72ed488844fc042e98536a77e9148cf 100644 (file)
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -38,7 +38,7 @@ static int caching_kthread(void *data)
         int slot;
         int ret;
  
-       if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+       if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
                 return 0;
  
         path = btrfs_alloc_path();
@@ -141,7 +141,7 @@ static void start_caching(struct btrfs_root *root)
         int ret;
         u64 objectid;
  
-       if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+       if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
                 return;
  
         spin_lock(&root->ino_cache_lock);
@@ -185,7 +185,7 @@ static void start_caching(struct btrfs_root *root)
  
  int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
  {
-       if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+       if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
                 return btrfs_find_free_objectid(root, objectid);
  
  again:
@@ -211,7 +211,7 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
  {
         struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
  
-       if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+       if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
                 return;
  again:
         if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
@@ -251,7 +251,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
         struct rb_node *n;
         u64 count;
  
-       if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+       if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
                 return;
  
         while (1) {
@@ -412,7 +412,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
         if (btrfs_root_refs(&root->root_item) == 0)
                 return 0;
  
-       if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+       if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
                 return 0;
  
         path = btrfs_alloc_path();
@@ -458,7 +458,7 @@ again:
         BTRFS_I(inode)->generation = 0;
         ret = btrfs_update_inode(trans, root, inode);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_put;
         }
  
@@ -466,7 +466,7 @@ again:
                 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
                 if (ret) {
                         if (ret != -ENOSPC)
-                               btrfs_abort_transaction(trans, root, ret);
+                               btrfs_abort_transaction(trans, ret);
                         goto out_put;
                 }
         }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index 8078077d109084aaa24a32f1a8914e36850f180d..b0f421f332ae911b731f4c4eb8705053705504ce 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -60,6 +60,7 @@
  #include "hash.h"
  #include "props.h"
  #include "qgroup.h"
+#include "dedupe.h"
  
  struct btrfs_iget_args {
         struct btrfs_key *location;
@@ -105,8 +106,9 @@ static int btrfs_truncate(struct inode *inode);
  static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
  static noinline int cow_file_range(struct inode *inode,
                                    struct page *locked_page,
-                                  u64 start, u64 end, int *page_started,
-                                  unsigned long *nr_written, int unlock);
+                                  u64 start, u64 end, u64 delalloc_end,
+                                  int *page_started, unsigned long *nr_written,
+                                  int unlock, struct btrfs_dedupe_hash *hash);
  static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
                                            u64 len, u64 orig_start,
                                            u64 block_start, u64 block_len,
@@ -294,7 +296,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
                                    start, aligned_end, NULL,
                                    1, 1, extent_item_size, &extent_inserted);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out;
         }
  
@@ -305,7 +307,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
                                    inline_len, compressed_size,
                                    compress_type, compressed_pages);
         if (ret && ret != -ENOSPC) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out;
         } else if (ret == -ENOSPC) {
                 ret = 1;
@@ -374,12 +376,12 @@ static inline int inode_need_compress(struct inode *inode)
         struct btrfs_root *root = BTRFS_I(inode)->root;
  
         /* force compress */
-       if (btrfs_test_opt(root, FORCE_COMPRESS))
+       if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
                 return 1;
         /* bad compression ratios */
         if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                 return 0;
-       if (btrfs_test_opt(root, COMPRESS) ||
+       if (btrfs_test_opt(root->fs_info, COMPRESS) ||
             BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
             BTRFS_I(inode)->force_compress)
                 return 1;
@@ -585,9 +587,27 @@ cont:
                         will_compress = 0;
                 } else {
                         num_bytes = total_in;
+                       *num_added += 1;
+
+                       /*
+                        * The async work queues will take care of doing actual
+                        * allocation on disk for these compressed pages, and
+                        * will submit them to the elevator.
+                        */
+                       add_async_extent(async_cow, start, num_bytes,
+                                       total_compressed, pages, nr_pages_ret,
+                                       compress_type);
+
+                       if (start + num_bytes < end) {
+                               start += num_bytes;
+                               pages = NULL;
+                               cond_resched();
+                               goto again;
+                       }
+                       return;
                 }
         }
-       if (!will_compress && pages) {
+       if (pages) {
                 /*
                  * the compression code ran but failed to make things smaller,
                  * free any pages it allocated and our page pointer array
@@ -602,48 +622,28 @@ cont:
                 nr_pages_ret = 0;
  
                 /* flag the file so we don't compress in the future */
-               if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
+               if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) &&
                     !(BTRFS_I(inode)->force_compress)) {
                         BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                 }
         }
-       if (will_compress) {
-               *num_added += 1;
-
-               /* the async work queues will take care of doing actual
-                * allocation on disk for these compressed pages,
-                * and will submit them to the elevator.
-                */
-               add_async_extent(async_cow, start, num_bytes,
-                                total_compressed, pages, nr_pages_ret,
-                                compress_type);
-
-               if (start + num_bytes < end) {
-                       start += num_bytes;
-                       pages = NULL;
-                       cond_resched();
-                       goto again;
-               }
-       } else {
  cleanup_and_bail_uncompressed:
-               /*
-                * No compression, but we still need to write the pages in
-                * the file we've been given so far.  redirty the locked
-                * page if it corresponds to our extent and set things up
-                * for the async work queue to run cow_file_range to do
-                * the normal delalloc dance
-                */
-               if (page_offset(locked_page) >= start &&
-                   page_offset(locked_page) <= end) {
-                       __set_page_dirty_nobuffers(locked_page);
-                       /* unlocked later on in the async handlers */
-               }
-               if (redirty)
-                       extent_range_redirty_for_io(inode, start, end);
-               add_async_extent(async_cow, start, end - start + 1,
-                                0, NULL, 0, BTRFS_COMPRESS_NONE);
-               *num_added += 1;
-       }
+       /*
+        * No compression, but we still need to write the pages in the file
+        * we've been given so far.  redirty the locked page if it corresponds
+        * to our extent and set things up for the async work queue to run
+        * cow_file_range to do the normal delalloc dance.
+        */
+       if (page_offset(locked_page) >= start &&
+           page_offset(locked_page) <= end)
+               __set_page_dirty_nobuffers(locked_page);
+               /* unlocked later on in the async handlers */
+
+       if (redirty)
+               extent_range_redirty_for_io(inode, start, end);
+       add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
+                        BTRFS_COMPRESS_NONE);
+       *num_added += 1;
  
         return;
  
@@ -712,7 +712,10 @@ retry:
                                              async_extent->start,
                                              async_extent->start +
                                              async_extent->ram_size - 1,
-                                            &page_started, &nr_written, 0);
+                                            async_extent->start +
+                                            async_extent->ram_size - 1,
+                                            &page_started, &nr_written, 0,
+                                            NULL);
  
                         /* JDM XXX */
  
@@ -925,9 +928,9 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
   */
  static noinline int cow_file_range(struct inode *inode,
                                    struct page *locked_page,
-                                  u64 start, u64 end, int *page_started,
-                                  unsigned long *nr_written,
-                                  int unlock)
+                                  u64 start, u64 end, u64 delalloc_end,
+                                  int *page_started, unsigned long *nr_written,
+                                  int unlock, struct btrfs_dedupe_hash *hash)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
         u64 alloc_hint = 0;
@@ -1156,7 +1159,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                 async_cow->start = start;
  
                 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
-                   !btrfs_test_opt(root, FORCE_COMPRESS))
+                   !btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
                         cur_end = end;
                 else
                         cur_end = min(end, start + SZ_512K - 1);
@@ -1418,7 +1421,8 @@ out_check:
                 if (cow_start != (u64)-1) {
                         ret = cow_file_range(inode, locked_page,
                                              cow_start, found_key.offset - 1,
-                                            page_started, nr_written, 1);
+                                            end, page_started, nr_written, 1,
+                                            NULL);
                         if (ret) {
                                 if (!nolock && nocow)
                                         btrfs_end_write_no_snapshoting(root);
@@ -1501,8 +1505,8 @@ out_check:
         }
  
         if (cow_start != (u64)-1) {
-               ret = cow_file_range(inode, locked_page, cow_start, end,
-                                    page_started, nr_written, 1);
+               ret = cow_file_range(inode, locked_page, cow_start, end, end,
+                                    page_started, nr_written, 1, NULL);
                 if (ret)
                         goto error;
         }
@@ -1561,8 +1565,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
                 ret = run_delalloc_nocow(inode, locked_page, start, end,
                                          page_started, 0, nr_written);
         } else if (!inode_need_compress(inode)) {
-               ret = cow_file_range(inode, locked_page, start, end,
-                                     page_started, nr_written, 1);
+               ret = cow_file_range(inode, locked_page, start, end, end,
+                                     page_started, nr_written, 1, NULL);
         } else {
                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
                         &BTRFS_I(inode)->runtime_flags);
@@ -1740,7 +1744,7 @@ static void btrfs_set_bit_hook(struct inode *inode,
                 }
  
                 /* For sanity tests */
-               if (btrfs_test_is_dummy_root(root))
+               if (btrfs_is_testing(root->fs_info))
                         return;
  
                 __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
@@ -1799,7 +1803,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                         btrfs_delalloc_release_metadata(inode, len);
  
                 /* For sanity tests. */
-               if (btrfs_test_is_dummy_root(root))
+               if (btrfs_is_testing(root->fs_info))
                         return;
  
                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
@@ -1822,6 +1826,10 @@ static void btrfs_clear_bit_hook(struct inode *inode,
  /*
   * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
   * we don't create bios that span stripes or chunks
+ *
+ * return 1 if page cannot be merged to bio
+ * return 0 if page can be merged to bio
+ * return error otherwise
   */
  int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
                          size_t size, struct bio *bio,
@@ -1840,8 +1848,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
         map_length = length;
         ret = btrfs_map_block(root->fs_info, bio_op(bio), logical,
                               &map_length, NULL, 0);
-       /* Will always return 0 with map_multi == NULL */
-       BUG_ON(ret < 0);
+       if (ret < 0)
+               return ret;
         if (map_length < length + size)
                 return 1;
         return 0;
@@ -2594,7 +2602,7 @@ again:
         ret = btrfs_insert_empty_item(trans, root, path, &key,
                                         sizeof(*extent));
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_free_path;
         }
  
@@ -2621,7 +2629,7 @@ again:
                         backref->root_id, backref->inum,
                         new->file_pos); /* start - extent_offset */
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_free_path;
         }
  
@@ -2890,7 +2898,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                 ret = btrfs_update_inode_fallback(trans, root, inode);
                 if (ret) /* -ENOMEM or corruption */
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                 goto out;
         }
  
@@ -2950,7 +2958,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                            ordered_extent->file_offset, ordered_extent->len,
                            trans->transid);
         if (ret < 0) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_unlock;
         }
  
@@ -2960,7 +2968,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
         btrfs_ordered_update_i_size(inode, 0, ordered_extent);
         ret = btrfs_update_inode_fallback(trans, root, inode);
         if (ret) { /* -ENOMEM or corruption */
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_unlock;
         }
         ret = 0;
@@ -3204,7 +3212,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
                 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
                                             root->root_key.objectid);
                 if (ret)
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                 else
                         clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
                                   &root->state);
@@ -3295,7 +3303,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
                         if (ret != -EEXIST) {
                                 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
                                           &BTRFS_I(inode)->runtime_flags);
-                               btrfs_abort_transaction(trans, root, ret);
+                               btrfs_abort_transaction(trans, ret);
                                 return ret;
                         }
                 }
@@ -3307,7 +3315,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
                 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
                                                root->root_key.objectid);
                 if (ret && ret != -EEXIST) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         return ret;
                 }
         }
@@ -4006,20 +4014,20 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
                 btrfs_info(root->fs_info,
                         "failed to delete reference to %.*s, inode %llu parent %llu",
                         name_len, name, ino, dir_ino);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto err;
         }
  skip_backref:
         ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto err;
         }
  
         ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
                                          inode, dir_ino);
         if (ret != 0 && ret != -ENOENT) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto err;
         }
  
@@ -4028,7 +4036,7 @@ skip_backref:
         if (ret == -ENOENT)
                 ret = 0;
         else if (ret)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
  err:
         btrfs_free_path(path);
         if (ret)
@@ -4142,7 +4150,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
         WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
         ret = btrfs_delete_one_dir_name(trans, root, path, di);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out;
         }
         btrfs_release_path(path);
@@ -4152,7 +4160,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
                                  dir_ino, &index, name, name_len);
         if (ret < 0) {
                 if (ret != -ENOENT) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
                 di = btrfs_search_dir_index_item(root, path, dir_ino,
@@ -4162,7 +4170,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
                                 ret = -ENOENT;
                         else
                                 ret = PTR_ERR(di);
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
  
@@ -4175,7 +4183,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
  
         ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out;
         }
  
@@ -4184,7 +4192,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
         dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
         ret = btrfs_update_inode_fallback(trans, root, dir);
         if (ret)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
  out:
         btrfs_free_path(path);
         return ret;
@@ -4505,7 +4513,6 @@ search_again:
                                                               pending_del_nr);
                                         if (err) {
                                                 btrfs_abort_transaction(trans,
-                                                                       root,
                                                                         err);
                                                 goto error;
                                         }
@@ -4517,8 +4524,7 @@ search_again:
                                                              item_end,
                                                              new_size);
                                 if (err) {
-                                       btrfs_abort_transaction(trans,
-                                                               root, err);
+                                       btrfs_abort_transaction(trans, err);
                                         goto error;
                                 }
                         } else if (test_bit(BTRFS_ROOT_REF_COWS,
@@ -4582,8 +4588,7 @@ delete:
                                                 pending_del_slot,
                                                 pending_del_nr);
                                 if (ret) {
-                                       btrfs_abort_transaction(trans,
-                                                               root, ret);
+                                       btrfs_abort_transaction(trans, ret);
                                         goto error;
                                 }
                                 pending_del_nr = 0;
@@ -4616,7 +4621,7 @@ out:
                 ret = btrfs_del_items(trans, root, path, pending_del_slot,
                                       pending_del_nr);
                 if (ret)
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
         }
  error:
         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
@@ -4785,7 +4790,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
  
         ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 btrfs_end_transaction(trans, root);
                 return ret;
         }
@@ -4793,7 +4798,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
         ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
                                        0, 0, len, 0, len, 0, 0, 0);
         if (ret)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
         else
                 btrfs_update_inode(trans, root, inode);
         btrfs_end_transaction(trans, root);
@@ -5020,7 +5025,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
                         i_size_write(inode, BTRFS_I(inode)->disk_i_size);
                         err = btrfs_orphan_del(trans, inode);
                         if (err)
-                               btrfs_abort_transaction(trans, root, err);
+                               btrfs_abort_transaction(trans, err);
                         btrfs_end_transaction(trans, root);
                 }
         }
@@ -5158,11 +5163,18 @@ void btrfs_evict_inode(struct inode *inode)
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_block_rsv *rsv, *global_rsv;
         int steal_from_global = 0;
-       u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+       u64 min_size;
         int ret;
  
         trace_btrfs_inode_evict(inode);
  
+       if (!root) {
+               kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+               return;
+       }
+
+       min_size = btrfs_calc_trunc_metadata_size(root, 1);
+
         evict_inode_truncate_pages(inode);
  
         if (inode->i_nlink &&
@@ -6239,9 +6251,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
         btrfs_inherit_iflags(inode, dir);
  
         if (S_ISREG(mode)) {
-               if (btrfs_test_opt(root, NODATASUM))
+               if (btrfs_test_opt(root->fs_info, NODATASUM))
                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
-               if (btrfs_test_opt(root, NODATACOW))
+               if (btrfs_test_opt(root->fs_info, NODATACOW))
                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
                                 BTRFS_INODE_NODATASUM;
         }
@@ -6319,7 +6331,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
         if (ret == -EEXIST || ret == -EOVERFLOW)
                 goto fail_dir_item;
         else if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 return ret;
         }
  
@@ -6330,7 +6342,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
                 current_fs_time(parent_inode->i_sb);
         ret = btrfs_update_inode(trans, root, parent_inode);
         if (ret)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
         return ret;
  
  fail_dir_item:
@@ -9385,25 +9397,25 @@ int btrfs_init_cachep(void)
  
         btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
                         sizeof(struct btrfs_trans_handle), 0,
-                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+                       SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
         if (!btrfs_trans_handle_cachep)
                 goto fail;
  
         btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
                         sizeof(struct btrfs_transaction), 0,
-                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+                       SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
         if (!btrfs_transaction_cachep)
                 goto fail;
  
         btrfs_path_cachep = kmem_cache_create("btrfs_path",
                         sizeof(struct btrfs_path), 0,
-                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+                       SLAB_MEM_SPREAD, NULL);
         if (!btrfs_path_cachep)
                 goto fail;
  
         btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
                         sizeof(struct btrfs_free_space), 0,
-                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+                       SLAB_MEM_SPREAD, NULL);
         if (!btrfs_free_space_cachep)
                 goto fail;
  
@@ -9553,7 +9565,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                         ret = btrfs_update_inode(trans, root, old_inode);
         }
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_fail;
         }
  
@@ -9573,7 +9585,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                         ret = btrfs_update_inode(trans, dest, new_inode);
         }
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_fail;
         }
  
@@ -9581,7 +9593,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                              new_dentry->d_name.name,
                              new_dentry->d_name.len, 0, old_idx);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_fail;
         }
  
@@ -9589,7 +9601,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
                              old_dentry->d_name.name,
                              old_dentry->d_name.len, 0, new_idx);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_fail;
         }
  
@@ -9828,7 +9840,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                         ret = btrfs_update_inode(trans, root, old_inode);
         }
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_fail;
         }
  
@@ -9852,7 +9864,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                 if (!ret && new_inode->i_nlink == 0)
                         ret = btrfs_orphan_add(trans, d_inode(new_dentry));
                 if (ret) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out_fail;
                 }
         }
@@ -9861,7 +9873,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                              new_dentry->d_name.name,
                              new_dentry->d_name.len, 0, index);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_fail;
         }
  
@@ -9881,7 +9893,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                                                 old_dentry);
  
                 if (ret) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out_fail;
                 }
         }
@@ -10307,7 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                 if (ret) {
                         btrfs_free_reserved_extent(root, ins.objectid,
                                                    ins.offset, 0);
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         if (own_trans)
                                 btrfs_end_transaction(trans, root);
                         break;
@@ -10367,7 +10379,7 @@ next:
                 ret = btrfs_update_inode(trans, root, inode);
  
                 if (ret) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         if (own_trans)
                                 btrfs_end_transaction(trans, root);
                         break;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c

index 05173563e4a6b9068b6c02f62b4dfd400b325ca5..14ed1e9e6bc83df20da4863798d0052b376e5636 100644 (file)
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -561,7 +561,7 @@ static noinline int create_subvol(struct inode *dir,
         new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
         if (IS_ERR(new_root)) {
                 ret = PTR_ERR(new_root);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
  
@@ -570,7 +570,7 @@ static noinline int create_subvol(struct inode *dir,
         ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
         if (ret) {
                 /* We potentially lose an unused inode item here */
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
  
@@ -583,7 +583,7 @@ static noinline int create_subvol(struct inode *dir,
          */
         ret = btrfs_set_inode_index(dir, &index);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
  
@@ -591,7 +591,7 @@ static noinline int create_subvol(struct inode *dir,
                                     name, namelen, dir, &key,
                                     BTRFS_FT_DIR, index);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
  
@@ -608,7 +608,7 @@ static noinline int create_subvol(struct inode *dir,
                                   root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
                                   objectid);
         if (ret)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
  
  fail:
         kfree(root_item);
@@ -1948,8 +1948,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
         return 1;
  }
  
-static noinline int copy_to_sk(struct btrfs_root *root,
-                              struct btrfs_path *path,
+static noinline int copy_to_sk(struct btrfs_path *path,
                                struct btrfs_key *key,
                                struct btrfs_ioctl_search_key *sk,
                                size_t *buf_size,
@@ -2120,7 +2119,7 @@ static noinline int search_ioctl(struct inode *inode,
                                 ret = 0;
                         goto err;
                 }
-               ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
+               ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
                                  &sk_offset, &num_found);
                 btrfs_release_path(path);
                 if (ret)
@@ -2406,7 +2405,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                  * rmdir(2).
                  */
                 err = -EPERM;
-               if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+               if (!btrfs_test_opt(root->fs_info, USER_SUBVOL_RM_ALLOWED))
                         goto out_dput;
  
                 /*
@@ -2489,7 +2488,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                                 dentry->d_name.len);
         if (ret) {
                 err = ret;
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_end_trans;
         }
  
@@ -2505,7 +2504,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                                         root->fs_info->tree_root,
                                         dest->root_key.objectid);
                 if (ret) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         err = ret;
                         goto out_end_trans;
                 }
@@ -2515,7 +2514,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                                   dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
                                   dest->root_key.objectid);
         if (ret && ret != -ENOENT) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 err = ret;
                 goto out_end_trans;
         }
@@ -2525,7 +2524,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                                           BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                           dest->root_key.objectid);
                 if (ret && ret != -ENOENT) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         err = ret;
                         goto out_end_trans;
                 }
@@ -3292,7 +3291,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
  
         ret = btrfs_update_inode(trans, root, inode);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 btrfs_end_transaction(trans, root);
                 goto out;
         }
@@ -3694,7 +3693,7 @@ process_slot:
                                 if (ret) {
                                         if (ret != -EOPNOTSUPP)
                                                 btrfs_abort_transaction(trans,
-                                                               root, ret);
+                                                                       ret);
                                         btrfs_end_transaction(trans, root);
                                         goto out;
                                 }
@@ -3702,8 +3701,7 @@ process_slot:
                                 ret = btrfs_insert_empty_item(trans, root, path,
                                                               &new_key, size);
                                 if (ret) {
-                                       btrfs_abort_transaction(trans, root,
-                                                               ret);
+                                       btrfs_abort_transaction(trans, ret);
                                         btrfs_end_transaction(trans, root);
                                         goto out;
                                 }
@@ -3735,7 +3733,6 @@ process_slot:
                                                         new_key.offset - datao);
                                         if (ret) {
                                                 btrfs_abort_transaction(trans,
-                                                                       root,
                                                                         ret);
                                                 btrfs_end_transaction(trans,
                                                                       root);
@@ -3772,7 +3769,6 @@ process_slot:
                                 if (ret) {
                                         if (ret != -EOPNOTSUPP)
                                                 btrfs_abort_transaction(trans,
-                                                                       root,
                                                                         ret);
                                         btrfs_end_transaction(trans, root);
                                         goto out;
@@ -3828,7 +3824,7 @@ process_slot:
                                          last_dest_end, destoff + len, 1);
                 if (ret) {
                         if (ret != -EOPNOTSUPP)
-                               btrfs_abort_transaction(trans, root, ret);
+                               btrfs_abort_transaction(trans, ret);
                         btrfs_end_transaction(trans, root);
                         goto out;
                 }
@@ -5164,13 +5160,13 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
                                           BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                           root->root_key.objectid);
                 if (ret < 0 && ret != -EEXIST) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
         }
         ret = btrfs_commit_transaction(trans, root);
         if (ret < 0) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out;
         }
  
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c

index aca8264f4a49d029c52ca9363cf4b64dcef1aef2..3b78d38173b3fa7eb0ddd5c07597c49e454bc954 100644 (file)
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -1122,7 +1122,7 @@ int __init ordered_data_init(void)
  {
         btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
                                      sizeof(struct btrfs_ordered_extent), 0,
-                                    SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                                    SLAB_MEM_SPREAD,
                                      NULL);
         if (!btrfs_ordered_extent_cache)
                 return -ENOMEM;
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c

index 36992128c7466cab65b1d8a9b71e670f32df9ac5..cf0b444ac4f306068c2da63349b3a151066118a1 100644 (file)
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -350,6 +350,7 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_root *parent_root)
  {
+       struct super_block *sb = root->fs_info->sb;
         struct btrfs_key key;
         struct inode *parent_inode, *child_inode;
         int ret;
@@ -358,12 +359,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
         key.type = BTRFS_INODE_ITEM_KEY;
         key.offset = 0;
  
-       parent_inode = btrfs_iget(parent_root->fs_info->sb, &key,
-                                 parent_root, NULL);
+       parent_inode = btrfs_iget(sb, &key, parent_root, NULL);
         if (IS_ERR(parent_inode))
                 return PTR_ERR(parent_inode);
  
-       child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+       child_inode = btrfs_iget(sb, &key, root, NULL);
         if (IS_ERR(child_inode)) {
                 iput(parent_inode);
                 return PTR_ERR(child_inode);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c

index 9d4c05b14f6e742c5dc6d4d990ef8de269496865..93ee1c18ef9d4c2e1677fc3ad437b3044c84f68e 100644 (file)
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -571,7 +571,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
         struct extent_buffer *leaf;
         struct btrfs_key key;
  
-       if (btrfs_test_is_dummy_root(quota_root))
+       if (btrfs_is_testing(quota_root->fs_info))
                 return 0;
  
         path = btrfs_alloc_path();
@@ -728,7 +728,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
         int ret;
         int slot;
  
-       if (btrfs_test_is_dummy_root(root))
+       if (btrfs_is_testing(root->fs_info))
                 return 0;
  
         key.objectid = 0;
@@ -1453,9 +1453,10 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
         return ret;
  }
  
-struct btrfs_qgroup_extent_record
-*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
-                                 struct btrfs_qgroup_extent_record *record)
+struct btrfs_qgroup_extent_record *
+btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
+                                struct btrfs_delayed_ref_root *delayed_refs,
+                                struct btrfs_qgroup_extent_record *record)
  {
         struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
         struct rb_node *parent_node = NULL;
@@ -1463,7 +1464,7 @@ struct btrfs_qgroup_extent_record
         u64 bytenr = record->bytenr;
  
         assert_spin_locked(&delayed_refs->lock);
-       trace_btrfs_qgroup_insert_dirty_extent(record);
+       trace_btrfs_qgroup_insert_dirty_extent(fs_info, record);
  
         while (*p) {
                 parent_node = *p;
@@ -1595,8 +1596,8 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
                 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
                 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
  
-               trace_qgroup_update_counters(qg->qgroupid, cur_old_count,
-                                            cur_new_count);
+               trace_qgroup_update_counters(fs_info, qg->qgroupid,
+                                            cur_old_count, cur_new_count);
  
                 /* Rfer update part */
                 if (cur_old_count == 0 && cur_new_count > 0) {
@@ -1687,8 +1688,8 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
                 goto out_free;
         BUG_ON(!fs_info->quota_root);
  
-       trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots,
-                                         nr_new_roots);
+       trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes,
+                                         nr_old_roots, nr_new_roots);
  
         qgroups = ulist_alloc(GFP_NOFS);
         if (!qgroups) {
@@ -1759,7 +1760,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
                 record = rb_entry(node, struct btrfs_qgroup_extent_record,
                                   node);
  
-               trace_btrfs_qgroup_account_extents(record);
+               trace_btrfs_qgroup_account_extents(fs_info, record);
  
                 if (!ret) {
                         /*
@@ -2195,7 +2196,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
  {
         if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
                 return;
-       btrfs_err(trans->root->fs_info,
+       btrfs_err(trans->fs_info,
                 "qgroups not uptodate in trans handle %p:  list is%s empty, "
                 "seq is %#x.%x",
                 trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h

index ecb2c143ef756bd0356e3968b3f9f13fe5ac21cd..710887c06aaf4c6171a74b2b954653dfa61478e1 100644 (file)
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -63,9 +63,10 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
  struct btrfs_delayed_extent_op;
  int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
                                          struct btrfs_fs_info *fs_info);
-struct btrfs_qgroup_extent_record
-*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
-                                 struct btrfs_qgroup_extent_record *record);
+struct btrfs_qgroup_extent_record *
+btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
+                                struct btrfs_delayed_ref_root *delayed_refs,
+                                struct btrfs_qgroup_extent_record *record);
  int
  btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info,
@@ -88,7 +89,7 @@ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
                                                  u64 ref_root, u64 num_bytes)
  {
         btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
-       trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes);
+       trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
  }
  void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
  
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c

index fc067b07e31fac354c45a0f0ad1b693886b80a47..b26a5aea41b4a67be495fb419080e3dd9d235136 100644 (file)
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -235,12 +235,12 @@ static void backref_cache_cleanup(struct backref_cache *cache)
         cache->last_trans = 0;
  
         for (i = 0; i < BTRFS_MAX_LEVEL; i++)
-               BUG_ON(!list_empty(&cache->pending[i]));
-       BUG_ON(!list_empty(&cache->changed));
-       BUG_ON(!list_empty(&cache->detached));
-       BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
-       BUG_ON(cache->nr_nodes);
-       BUG_ON(cache->nr_edges);
+               ASSERT(list_empty(&cache->pending[i]));
+       ASSERT(list_empty(&cache->changed));
+       ASSERT(list_empty(&cache->detached));
+       ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
+       ASSERT(!cache->nr_nodes);
+       ASSERT(!cache->nr_edges);
  }
  
  static struct backref_node *alloc_backref_node(struct backref_cache *cache)
@@ -1171,8 +1171,12 @@ out:
                         lower = list_entry(useless.next,
                                            struct backref_node, list);
                         list_del_init(&lower->list);
+                       if (lower == node)
+                               node = NULL;
                         free_backref_node(cache, lower);
                 }
+
+               free_backref_node(cache, node);
                 return ERR_PTR(err);
         }
         ASSERT(!node || !node->detached);
@@ -1719,7 +1723,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
                                            btrfs_header_owner(leaf),
                                            key.objectid, key.offset);
                 if (ret) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         break;
                 }
  
@@ -1727,7 +1731,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
                                         parent, btrfs_header_owner(leaf),
                                         key.objectid, key.offset);
                 if (ret) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         break;
                 }
         }
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c

index f1c30861d062713e8feaaf5886c7bbca2596aa88..7fd7e1830cfe676e74000b54de026a5a1070cc1a 100644 (file)
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -150,7 +150,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
  
         ret = btrfs_search_slot(trans, root, key, path, 0, 1);
         if (ret < 0) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out;
         }
  
@@ -176,20 +176,20 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
                 ret = btrfs_search_slot(trans, root, key, path,
                                 -1, 1);
                 if (ret < 0) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
  
                 ret = btrfs_del_item(trans, root, path);
                 if (ret < 0) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
                 btrfs_release_path(path);
                 ret = btrfs_insert_empty_item(trans, root, path,
                                 key, sizeof(*item));
                 if (ret < 0) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
                 l = path->nodes[0];
@@ -448,7 +448,7 @@ again:
         ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
                                       sizeof(*ref) + name_len);
         if (ret) {
-               btrfs_abort_transaction(trans, tree_root, ret);
+               btrfs_abort_transaction(trans, ret);
                 btrfs_free_path(path);
                 return ret;
         }
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c

index e08b6bc676e3faa4d93c211b11a59a90c1a62891..1d195d2b32c6ee62d497539172c15e011c26e467 100644 (file)
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3785,27 +3785,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
         if (fs_info->scrub_workers_refcnt == 0) {
                 if (is_dev_replace)
                         fs_info->scrub_workers =
-                               btrfs_alloc_workqueue("scrub", flags,
+                               btrfs_alloc_workqueue(fs_info, "scrub", flags,
                                                       1, 4);
                 else
                         fs_info->scrub_workers =
-                               btrfs_alloc_workqueue("scrub", flags,
+                               btrfs_alloc_workqueue(fs_info, "scrub", flags,
                                                       max_active, 4);
                 if (!fs_info->scrub_workers)
                         goto fail_scrub_workers;
  
                 fs_info->scrub_wr_completion_workers =
-                       btrfs_alloc_workqueue("scrubwrc", flags,
+                       btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
                                               max_active, 2);
                 if (!fs_info->scrub_wr_completion_workers)
                         goto fail_scrub_wr_completion_workers;
  
                 fs_info->scrub_nocow_workers =
-                       btrfs_alloc_workqueue("scrubnc", flags, 1, 0);
+                       btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0);
                 if (!fs_info->scrub_nocow_workers)
                         goto fail_scrub_nocow_workers;
                 fs_info->scrub_parity_workers =
-                       btrfs_alloc_workqueue("scrubparity", flags,
+                       btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
                                               max_active, 2);
                 if (!fs_info->scrub_parity_workers)
                         goto fail_scrub_parity_workers;
@@ -3860,7 +3860,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
  
         if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
                 /* not supported for data w/o checksums */
-               btrfs_err(fs_info,
+               btrfs_err_rl(fs_info,
                            "scrub: size assumption sectorsize != PAGE_SIZE "
                            "(%d != %lu) fails",
                        fs_info->chunk_root->sectorsize, PAGE_SIZE);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c

index 60e7179ed4b77ff8dcd9de1d8eb1c0cebce9f95d..864ce334f696c31badf2919b07d503416545ded6 100644 (file)
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -184,6 +184,22 @@ static const char * const logtypes[] = {
         "debug",
  };
  
+
+/*
+ * Use one ratelimit state per log level so that a flood of less important
+ * messages doesn't cause more important ones to be dropped.
+ */
+static struct ratelimit_state printk_limits[] = {
+       RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
+       RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
+       RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
+       RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
+       RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
+       RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
+       RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
+       RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
+};
+
  void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
  {
         struct super_block *sb = fs_info->sb;
@@ -192,6 +208,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
         va_list args;
         const char *type = logtypes[4];
         int kern_level;
+       struct ratelimit_state *ratelimit;
  
         va_start(args, fmt);
  
@@ -202,13 +219,18 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
                 lvl[size] = '\0';
                 fmt += size;
                 type = logtypes[kern_level - '0'];
-       } else
+               ratelimit = &printk_limits[kern_level - '0'];
+       } else {
                 *lvl = '\0';
+               /* Default to debug output */
+               ratelimit = &printk_limits[7];
+       }
  
         vaf.fmt = fmt;
         vaf.va = &args;
  
-       printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
+       if (__ratelimit(ratelimit))
+               printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
  
         va_end(args);
  }
@@ -229,9 +251,11 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
   */
  __cold
  void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root, const char *function,
+                              const char *function,
                                unsigned int line, int errno)
  {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
+
         trans->aborted = errno;
         /* Nothing used. The other threads that have joined this
          * transaction may be able to continue. */
@@ -239,16 +263,16 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
                 const char *errstr;
  
                 errstr = btrfs_decode_error(errno);
-               btrfs_warn(root->fs_info,
+               btrfs_warn(fs_info,
                            "%s:%d: Aborting unused transaction(%s).",
                            function, line, errstr);
                 return;
         }
         ACCESS_ONCE(trans->transaction->aborted) = errno;
         /* Wake up anybody who may be waiting on this transaction */
-       wake_up(&root->fs_info->transaction_wait);
-       wake_up(&root->fs_info->transaction_blocked_wait);
-       __btrfs_handle_fs_error(root->fs_info, function, line, errno, NULL);
+       wake_up(&fs_info->transaction_wait);
+       wake_up(&fs_info->transaction_blocked_wait);
+       __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
  }
  /*
   * __btrfs_panic decodes unexpected, fatal errors from the caller,
@@ -432,12 +456,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
                          */
                         break;
                 case Opt_nodatasum:
-                       btrfs_set_and_info(root, NODATASUM,
+                       btrfs_set_and_info(info, NODATASUM,
                                            "setting nodatasum");
                         break;
                 case Opt_datasum:
-                       if (btrfs_test_opt(root, NODATASUM)) {
-                               if (btrfs_test_opt(root, NODATACOW))
+                       if (btrfs_test_opt(info, NODATASUM)) {
+                               if (btrfs_test_opt(info, NODATACOW))
                                         btrfs_info(root->fs_info, "setting datasum, datacow enabled");
                                 else
                                         btrfs_info(root->fs_info, "setting datasum");
@@ -446,9 +470,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
                         btrfs_clear_opt(info->mount_opt, NODATASUM);
                         break;
                 case Opt_nodatacow:
-                       if (!btrfs_test_opt(root, NODATACOW)) {
-                               if (!btrfs_test_opt(root, COMPRESS) ||
-                                   !btrfs_test_opt(root, FORCE_COMPRESS)) {
+                       if (!btrfs_test_opt(info, NODATACOW)) {
+                               if (!btrfs_test_opt(info, COMPRESS) ||
+                                   !btrfs_test_opt(info, FORCE_COMPRESS)) {
                                         btrfs_info(root->fs_info,
                                                    "setting nodatacow, compression disabled");
                                 } else {
@@ -461,7 +485,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
                         btrfs_set_opt(info->mount_opt, NODATASUM);
                         break;
                 case Opt_datacow:
-                       btrfs_clear_and_info(root, NODATACOW,
+                       btrfs_clear_and_info(info, NODATACOW,
                                              "setting datacow");
                         break;
                 case Opt_compress_force:
@@ -470,10 +494,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
                         /* Fallthrough */
                 case Opt_compress:
                 case Opt_compress_type:
-                       saved_compress_type = btrfs_test_opt(root, COMPRESS) ?
+                       saved_compress_type = btrfs_test_opt(info,
+                                                            COMPRESS) ?
                                 info->compress_type : BTRFS_COMPRESS_NONE;
                         saved_compress_force =
-                               btrfs_test_opt(root, FORCE_COMPRESS);
+                               btrfs_test_opt(info, FORCE_COMPRESS);
                         if (token == Opt_compress ||
                             token == Opt_compress_force ||
                             strcmp(args[0].from, "zlib") == 0) {
@@ -513,10 +538,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
                                  */
                                 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
                         }
-                       if ((btrfs_test_opt(root, COMPRESS) &&
+                       if ((btrfs_test_opt(info, COMPRESS) &&
                              (info->compress_type != saved_compress_type ||
                               compress_force != saved_compress_force)) ||
-                           (!btrfs_test_opt(root, COMPRESS) &&
+                           (!btrfs_test_opt(info, COMPRESS) &&
                              no_compress == 1)) {
                                 btrfs_info(root->fs_info,
                                            "%s %s compression",
@@ -526,25 +551,25 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
                         compress_force = false;
                         break;
                 case Opt_ssd:
-                       btrfs_set_and_info(root, SSD,
+                       btrfs_set_and_info(info, SSD,
                                            "use ssd allocation scheme");
                         break;
                 case Opt_ssd_spread:
-                       btrfs_set_and_info(root, SSD_SPREAD,
+                       btrfs_set_and_info(info, SSD_SPREAD,
                                            "use spread ssd allocation scheme");
                         btrfs_set_opt(info->mount_opt, SSD);
                         break;
                 case Opt_nossd:
-                       btrfs_set_and_info(root, NOSSD,
+                       btrfs_set_and_info(info, NOSSD,
                                              "not using ssd allocation scheme");
                         btrfs_clear_opt(info->mount_opt, SSD);
                         break;
                 case Opt_barrier:
-                       btrfs_clear_and_info(root, NOBARRIER,
+                       btrfs_clear_and_info(info, NOBARRIER,
                                              "turning on barriers");
                         break;
                 case Opt_nobarrier:
-                       btrfs_set_and_info(root, NOBARRIER,
+                       btrfs_set_and_info(info, NOBARRIER,
                                            "turning off barriers");
                         break;
                 case Opt_thread_pool:
@@ -604,24 +629,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
                         root->fs_info->sb->s_flags &= ~MS_POSIXACL;
                         break;
                 case Opt_notreelog:
-                       btrfs_set_and_info(root, NOTREELOG,
+                       btrfs_set_and_info(info, NOTREELOG,
                                            "disabling tree log");
                         break;
                 case Opt_treelog:
-                       btrfs_clear_and_info(root, NOTREELOG,
+                       btrfs_clear_and_info(info, NOTREELOG,
                                              "enabling tree log");
                         break;
                 case Opt_norecovery:
                 case Opt_nologreplay:
-                       btrfs_set_and_info(root, NOLOGREPLAY,
+                       btrfs_set_and_info(info, NOLOGREPLAY,
                                            "disabling log replay at mount time");
                         break;
                 case Opt_flushoncommit:
-                       btrfs_set_and_info(root, FLUSHONCOMMIT,
+                       btrfs_set_and_info(info, FLUSHONCOMMIT,
                                            "turning on flush-on-commit");
                         break;
                 case Opt_noflushoncommit:
-                       btrfs_clear_and_info(root, FLUSHONCOMMIT,
+                       btrfs_clear_and_info(info, FLUSHONCOMMIT,
                                              "turning off flush-on-commit");
                         break;
                 case Opt_ratio:
@@ -638,11 +663,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
                         }
                         break;
                 case Opt_discard:
-                       btrfs_set_and_info(root, DISCARD,
+                       btrfs_set_and_info(info, DISCARD,
                                            "turning on discard");
                         break;
                 case Opt_nodiscard:
-                       btrfs_clear_and_info(root, DISCARD,
+                       btrfs_clear_and_info(info, DISCARD,
                                              "turning off discard");
                         break;
                 case Opt_space_cache:
@@ -651,12 +676,13 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
                             strcmp(args[0].from, "v1") == 0) {
                                 btrfs_clear_opt(root->fs_info->mount_opt,
                                                 FREE_SPACE_TREE);
-                               btrfs_set_and_info(root, SPACE_CACHE,
+                               btrfs_set_and_info(info, SPACE_CACHE,
                                                    "enabling disk space caching");
                         } else if (strcmp(args[0].from, "v2") == 0) {
                                 btrfs_clear_opt(root->fs_info->mount_opt,
                                                 SPACE_CACHE);
-                               btrfs_set_and_info(root, FREE_SPACE_TREE,
+                               btrfs_set_and_info(info,
+                                                  FREE_SPACE_TREE,
                                                    "enabling free space tree");
                         } else {
                                 ret = -EINVAL;
@@ -667,12 +693,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
                         btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
                         break;
                 case Opt_no_space_cache:
-                       if (btrfs_test_opt(root, SPACE_CACHE)) {
-                               btrfs_clear_and_info(root, SPACE_CACHE,
+                       if (btrfs_test_opt(info, SPACE_CACHE)) {
+                               btrfs_clear_and_info(info,
+                                                    SPACE_CACHE,
                                                      "disabling disk space caching");
                         }
-                       if (btrfs_test_opt(root, FREE_SPACE_TREE)) {
-                               btrfs_clear_and_info(root, FREE_SPACE_TREE,
+                       if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
+                               btrfs_clear_and_info(info,
+                                                    FREE_SPACE_TREE,
                                                      "disabling free space tree");
                         }
                         break;
@@ -685,7 +713,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
                                              "disabling inode map caching");
                         break;
                 case Opt_clear_cache:
-                       btrfs_set_and_info(root, CLEAR_CACHE,
+                       btrfs_set_and_info(info, CLEAR_CACHE,
                                            "force clearing of disk cache");
                         break;
                 case Opt_user_subvol_rm_allowed:
@@ -698,11 +726,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
                         btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
                         break;
                 case Opt_defrag:
-                       btrfs_set_and_info(root, AUTO_DEFRAG,
+                       btrfs_set_and_info(info, AUTO_DEFRAG,
                                            "enabling auto defrag");
                         break;
                 case Opt_nodefrag:
-                       btrfs_clear_and_info(root, AUTO_DEFRAG,
+                       btrfs_clear_and_info(info, AUTO_DEFRAG,
                                              "disabling auto defrag");
                         break;
                 case Opt_recovery:
@@ -810,22 +838,22 @@ check:
         /*
          * Extra check for current option against current flag
          */
-       if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
+       if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
                 btrfs_err(root->fs_info,
                           "nologreplay must be used with ro mount option");
                 ret = -EINVAL;
         }
  out:
         if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
-           !btrfs_test_opt(root, FREE_SPACE_TREE) &&
-           !btrfs_test_opt(root, CLEAR_CACHE)) {
+           !btrfs_test_opt(info, FREE_SPACE_TREE) &&
+           !btrfs_test_opt(info, CLEAR_CACHE)) {
                 btrfs_err(root->fs_info, "cannot disable free space tree");
                 ret = -EINVAL;
  
         }
-       if (!ret && btrfs_test_opt(root, SPACE_CACHE))
+       if (!ret && btrfs_test_opt(info, SPACE_CACHE))
                 btrfs_info(root->fs_info, "disk space caching is enabled");
-       if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE))
+       if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
                 btrfs_info(root->fs_info, "using free space tree");
         kfree(orig);
         return ret;
@@ -1149,7 +1177,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
         struct btrfs_fs_info *fs_info = btrfs_sb(sb);
         struct btrfs_root *root = fs_info->tree_root;
  
-       trace_btrfs_sync_fs(wait);
+       trace_btrfs_sync_fs(fs_info, wait);
  
         if (!wait) {
                 filemap_flush(fs_info->btree_inode->i_mapping);
@@ -1192,13 +1220,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
         struct btrfs_root *root = info->tree_root;
         char *compress_type;
  
-       if (btrfs_test_opt(root, DEGRADED))
+       if (btrfs_test_opt(info, DEGRADED))
                 seq_puts(seq, ",degraded");
-       if (btrfs_test_opt(root, NODATASUM))
+       if (btrfs_test_opt(info, NODATASUM))
                 seq_puts(seq, ",nodatasum");
-       if (btrfs_test_opt(root, NODATACOW))
+       if (btrfs_test_opt(info, NODATACOW))
                 seq_puts(seq, ",nodatacow");
-       if (btrfs_test_opt(root, NOBARRIER))
+       if (btrfs_test_opt(info, NOBARRIER))
                 seq_puts(seq, ",nobarrier");
         if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
                 seq_printf(seq, ",max_inline=%llu", info->max_inline);
@@ -1207,56 +1235,56 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
         if (info->thread_pool_size !=  min_t(unsigned long,
                                              num_online_cpus() + 2, 8))
                 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
-       if (btrfs_test_opt(root, COMPRESS)) {
+       if (btrfs_test_opt(info, COMPRESS)) {
                 if (info->compress_type == BTRFS_COMPRESS_ZLIB)
                         compress_type = "zlib";
                 else
                         compress_type = "lzo";
-               if (btrfs_test_opt(root, FORCE_COMPRESS))
+               if (btrfs_test_opt(info, FORCE_COMPRESS))
                         seq_printf(seq, ",compress-force=%s", compress_type);
                 else
                         seq_printf(seq, ",compress=%s", compress_type);
         }
-       if (btrfs_test_opt(root, NOSSD))
+       if (btrfs_test_opt(info, NOSSD))
                 seq_puts(seq, ",nossd");
-       if (btrfs_test_opt(root, SSD_SPREAD))
+       if (btrfs_test_opt(info, SSD_SPREAD))
                 seq_puts(seq, ",ssd_spread");
-       else if (btrfs_test_opt(root, SSD))
+       else if (btrfs_test_opt(info, SSD))
                 seq_puts(seq, ",ssd");
-       if (btrfs_test_opt(root, NOTREELOG))
+       if (btrfs_test_opt(info, NOTREELOG))
                 seq_puts(seq, ",notreelog");
-       if (btrfs_test_opt(root, NOLOGREPLAY))
+       if (btrfs_test_opt(info, NOLOGREPLAY))
                 seq_puts(seq, ",nologreplay");
-       if (btrfs_test_opt(root, FLUSHONCOMMIT))
+       if (btrfs_test_opt(info, FLUSHONCOMMIT))
                 seq_puts(seq, ",flushoncommit");
-       if (btrfs_test_opt(root, DISCARD))
+       if (btrfs_test_opt(info, DISCARD))
                 seq_puts(seq, ",discard");
         if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
                 seq_puts(seq, ",noacl");
-       if (btrfs_test_opt(root, SPACE_CACHE))
+       if (btrfs_test_opt(info, SPACE_CACHE))
                 seq_puts(seq, ",space_cache");
-       else if (btrfs_test_opt(root, FREE_SPACE_TREE))
+       else if (btrfs_test_opt(info, FREE_SPACE_TREE))
                 seq_puts(seq, ",space_cache=v2");
         else
                 seq_puts(seq, ",nospace_cache");
-       if (btrfs_test_opt(root, RESCAN_UUID_TREE))
+       if (btrfs_test_opt(info, RESCAN_UUID_TREE))
                 seq_puts(seq, ",rescan_uuid_tree");
-       if (btrfs_test_opt(root, CLEAR_CACHE))
+       if (btrfs_test_opt(info, CLEAR_CACHE))
                 seq_puts(seq, ",clear_cache");
-       if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+       if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED))
                 seq_puts(seq, ",user_subvol_rm_allowed");
-       if (btrfs_test_opt(root, ENOSPC_DEBUG))
+       if (btrfs_test_opt(info, ENOSPC_DEBUG))
                 seq_puts(seq, ",enospc_debug");
-       if (btrfs_test_opt(root, AUTO_DEFRAG))
+       if (btrfs_test_opt(info, AUTO_DEFRAG))
                 seq_puts(seq, ",autodefrag");
-       if (btrfs_test_opt(root, INODE_MAP_CACHE))
+       if (btrfs_test_opt(info, INODE_MAP_CACHE))
                 seq_puts(seq, ",inode_cache");
-       if (btrfs_test_opt(root, SKIP_BALANCE))
+       if (btrfs_test_opt(info, SKIP_BALANCE))
                 seq_puts(seq, ",skip_balance");
  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-       if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
+       if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
                 seq_puts(seq, ",check_int_data");
-       else if (btrfs_test_opt(root, CHECK_INTEGRITY))
+       else if (btrfs_test_opt(info, CHECK_INTEGRITY))
                 seq_puts(seq, ",check_int");
         if (info->check_integrity_print_mask)
                 seq_printf(seq, ",check_int_print_mask=%d",
@@ -1265,14 +1293,14 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
         if (info->metadata_ratio)
                 seq_printf(seq, ",metadata_ratio=%d",
                                 info->metadata_ratio);
-       if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
+       if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
                 seq_puts(seq, ",fatal_errors=panic");
         if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
                 seq_printf(seq, ",commit=%d", info->commit_interval);
  #ifdef CONFIG_BTRFS_DEBUG
-       if (btrfs_test_opt(root, FRAGMENT_DATA))
+       if (btrfs_test_opt(info, FRAGMENT_DATA))
                 seq_puts(seq, ",fragment=data");
-       if (btrfs_test_opt(root, FRAGMENT_METADATA))
+       if (btrfs_test_opt(info, FRAGMENT_METADATA))
                 seq_puts(seq, ",fragment=metadata");
  #endif
         seq_printf(seq, ",subvolid=%llu",
@@ -2030,9 +2058,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
   * chunk).
   *
   * If metadata is exhausted, f_bavail will be 0.
- *
- * FIXME: not accurate for mixed block groups, total and free/used are ok,
- * available appears slightly larger.
   */
  static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
  {
@@ -2319,49 +2344,6 @@ static void btrfs_print_mod_info(void)
                         btrfs_crc32c_impl());
  }
  
-static int btrfs_run_sanity_tests(void)
-{
-       int ret, i;
-       u32 sectorsize, nodesize;
-       u32 test_sectorsize[] = {
-               PAGE_SIZE,
-       };
-       ret = btrfs_init_test_fs();
-       if (ret)
-               return ret;
-       for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) {
-               sectorsize = test_sectorsize[i];
-               for (nodesize = sectorsize;
-                    nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE;
-                    nodesize <<= 1) {
-                       pr_info("BTRFS: selftest: sectorsize: %u  nodesize: %u\n",
-                               sectorsize, nodesize);
-                       ret = btrfs_test_free_space_cache(sectorsize, nodesize);
-                       if (ret)
-                               goto out;
-                       ret = btrfs_test_extent_buffer_operations(sectorsize,
-                               nodesize);
-                       if (ret)
-                               goto out;
-                       ret = btrfs_test_extent_io(sectorsize, nodesize);
-                       if (ret)
-                               goto out;
-                       ret = btrfs_test_inodes(sectorsize, nodesize);
-                       if (ret)
-                               goto out;
-                       ret = btrfs_test_qgroups(sectorsize, nodesize);
-                       if (ret)
-                               goto out;
-                       ret = btrfs_test_free_space_tree(sectorsize, nodesize);
-                       if (ret)
-                               goto out;
-               }
-       }
-out:
-       btrfs_destroy_test_fs();
-       return ret;
-}
-
  static int __init init_btrfs_fs(void)
  {
         int err;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c

index 4879656bda3cdebefbcd6390bf6ab6597e616ac0..c6569905d3d1cc5058b7a7a67785a49d6e5d1104 100644 (file)
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -326,6 +326,7 @@ SPACE_INFO_ATTR(bytes_used);
  SPACE_INFO_ATTR(bytes_pinned);
  SPACE_INFO_ATTR(bytes_reserved);
  SPACE_INFO_ATTR(bytes_may_use);
+SPACE_INFO_ATTR(bytes_readonly);
  SPACE_INFO_ATTR(disk_used);
  SPACE_INFO_ATTR(disk_total);
  BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
@@ -337,6 +338,7 @@ static struct attribute *space_info_attrs[] = {
         BTRFS_ATTR_PTR(bytes_pinned),
         BTRFS_ATTR_PTR(bytes_reserved),
         BTRFS_ATTR_PTR(bytes_may_use),
+       BTRFS_ATTR_PTR(bytes_readonly),
         BTRFS_ATTR_PTR(disk_used),
         BTRFS_ATTR_PTR(disk_total),
         BTRFS_ATTR_PTR(total_bytes_pinned),
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c

index 02223f3f78f4b54bc1fca294d71ac75b0e61b2ce..bf62ad919a95db4725483000b6110b6e20890214 100644 (file)
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -54,7 +54,7 @@ struct inode *btrfs_new_test_inode(void)
         return new_inode(test_mnt->mnt_sb);
  }
  
-int btrfs_init_test_fs(void)
+static int btrfs_init_test_fs(void)
  {
         int ret;
  
@@ -73,7 +73,7 @@ int btrfs_init_test_fs(void)
         return 0;
  }
  
-void btrfs_destroy_test_fs(void)
+static void btrfs_destroy_test_fs(void)
  {
         kern_unmount(test_mnt);
         unregister_filesystem(&test_type);
@@ -128,14 +128,27 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
         extent_io_tree_init(&fs_info->freed_extents[0], NULL);
         extent_io_tree_init(&fs_info->freed_extents[1], NULL);
         fs_info->pinned_extents = &fs_info->freed_extents[0];
+       set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+
+       test_mnt->mnt_sb->s_fs_info = fs_info;
+
         return fs_info;
  }
  
-static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
+void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
  {
         struct radix_tree_iter iter;
         void **slot;
  
+       if (!fs_info)
+               return;
+
+       if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
+                             &fs_info->fs_state)))
+               return;
+
+       test_mnt->mnt_sb->s_fs_info = NULL;
+
         spin_lock(&fs_info->buffer_lock);
         radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
                 struct extent_buffer *eb;
@@ -167,10 +180,11 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
  {
         if (!root)
                 return;
+       /* Will be freed by btrfs_free_fs_roots */
+       if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
+               return;
         if (root->node)
                 free_extent_buffer(root->node);
-       if (root->fs_info)
-               btrfs_free_dummy_fs_info(root->fs_info);
         kfree(root);
  }
  
@@ -220,3 +234,46 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
         INIT_LIST_HEAD(&trans->qgroup_ref_list);
         trans->type = __TRANS_DUMMY;
  }
+
+int btrfs_run_sanity_tests(void)
+{
+       int ret, i;
+       u32 sectorsize, nodesize;
+       u32 test_sectorsize[] = {
+               PAGE_SIZE,
+       };
+       ret = btrfs_init_test_fs();
+       if (ret)
+               return ret;
+       for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) {
+               sectorsize = test_sectorsize[i];
+               for (nodesize = sectorsize;
+                    nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE;
+                    nodesize <<= 1) {
+                       pr_info("BTRFS: selftest: sectorsize: %u  nodesize: %u\n",
+                               sectorsize, nodesize);
+                       ret = btrfs_test_free_space_cache(sectorsize, nodesize);
+                       if (ret)
+                               goto out;
+                       ret = btrfs_test_extent_buffer_operations(sectorsize,
+                               nodesize);
+                       if (ret)
+                               goto out;
+                       ret = btrfs_test_extent_io(sectorsize, nodesize);
+                       if (ret)
+                               goto out;
+                       ret = btrfs_test_inodes(sectorsize, nodesize);
+                       if (ret)
+                               goto out;
+                       ret = btrfs_test_qgroups(sectorsize, nodesize);
+                       if (ret)
+                               goto out;
+                       ret = btrfs_test_free_space_tree(sectorsize, nodesize);
+                       if (ret)
+                               goto out;
+               }
+       }
+out:
+       btrfs_destroy_test_fs();
+       return ret;
+}
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h

index 66fb6b701eb72975fa203fd4018ae847cffd46ab..b17ffbe8f9f33651a319dd5574a40320ee476fed 100644 (file)
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -20,57 +20,29 @@
  #define __BTRFS_TESTS
  
  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_run_sanity_tests(void);
  
  #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
  
  struct btrfs_root;
  struct btrfs_trans_handle;
  
-int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize);
  int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize);
+int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize);
  int btrfs_test_extent_io(u32 sectorsize, u32 nodesize);
  int btrfs_test_inodes(u32 sectorsize, u32 nodesize);
  int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
  int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
-int btrfs_init_test_fs(void);
-void btrfs_destroy_test_fs(void);
  struct inode *btrfs_new_test_inode(void);
  struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
+void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
  void btrfs_free_dummy_root(struct btrfs_root *root);
  struct btrfs_block_group_cache *
  btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize);
  void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
  void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans);
  #else
-static inline int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
-{
-       return 0;
-}
-static inline int btrfs_test_extent_buffer_operations(u32 sectorsize,
-       u32 nodesize)
-{
-       return 0;
-}
-static inline int btrfs_init_test_fs(void)
-{
-       return 0;
-}
-static inline void btrfs_destroy_test_fs(void)
-{
-}
-static inline int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
-{
-       return 0;
-}
-static inline int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
-{
-       return 0;
-}
-static inline int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
-{
-       return 0;
-}
-static inline int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize)
+static inline int btrfs_run_sanity_tests(void)
  {
         return 0;
  }
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c

index 4f8cbd1ec5ee340d862a8f8c415ce828ebd1a9c8..199569174637481fe2138dc20ea696388f2093f6 100644 (file)
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -24,8 +24,9 @@
  
  static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
  {
-       struct btrfs_path *path;
-       struct btrfs_root *root;
+       struct btrfs_fs_info *fs_info;
+       struct btrfs_path *path = NULL;
+       struct btrfs_root *root = NULL;
         struct extent_buffer *eb;
         struct btrfs_item *item;
         char *value = "mary had a little lamb";
@@ -40,17 +41,24 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
  
         test_msg("Running btrfs_split_item tests\n");
  
-       root = btrfs_alloc_dummy_root(sectorsize, nodesize);
+       fs_info = btrfs_alloc_dummy_fs_info();
+       if (!fs_info) {
+               test_msg("Could not allocate fs_info\n");
+               return -ENOMEM;
+       }
+
+       root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
         if (IS_ERR(root)) {
                 test_msg("Could not allocate root\n");
-               return PTR_ERR(root);
+               ret = PTR_ERR(root);
+               goto out;
         }
  
         path = btrfs_alloc_path();
         if (!path) {
                 test_msg("Could not allocate path\n");
-               kfree(root);
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
         }
  
         path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize,
@@ -219,7 +227,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
         }
  out:
         btrfs_free_path(path);
-       kfree(root);
+       btrfs_free_dummy_root(root);
+       btrfs_free_dummy_fs_info(fs_info);
         return ret;
  }
  
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c

index 3956bb2ff84cab59bf25fc9869cebc83d8606621..3221c8dee272f2bebe8afb705c725e7a8ed9b25f 100644 (file)
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -837,6 +837,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
  
  int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
  {
+       struct btrfs_fs_info *fs_info;
         struct btrfs_block_group_cache *cache;
         struct btrfs_root *root = NULL;
         int ret = -ENOMEM;
@@ -855,15 +856,17 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
                 return 0;
         }
  
-       root = btrfs_alloc_dummy_root(sectorsize, nodesize);
-       if (IS_ERR(root)) {
-               ret = PTR_ERR(root);
+       fs_info = btrfs_alloc_dummy_fs_info();
+       if (!fs_info) {
+               ret = -ENOMEM;
                 goto out;
         }
  
-       root->fs_info = btrfs_alloc_dummy_fs_info();
-       if (!root->fs_info)
+       root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+       if (IS_ERR(root)) {
+               ret = PTR_ERR(root);
                 goto out;
+       }
  
         root->fs_info->extent_root = root;
         cache->fs_info = root->fs_info;
@@ -882,6 +885,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
  out:
         btrfs_free_dummy_block_group(cache);
         btrfs_free_dummy_root(root);
+       btrfs_free_dummy_fs_info(fs_info);
         test_msg("Free space cache tests finished\n");
         return ret;
  }
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c

index aac507085ab0958f89a1ce81f0dc69d3824c2a6c..7508d3b427804c0cb634b565e0b625e99a2c0dfc 100644 (file)
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -443,23 +443,24 @@ typedef int (*test_func_t)(struct btrfs_trans_handle *,
  static int run_test(test_func_t test_func, int bitmaps,
                 u32 sectorsize, u32 nodesize)
  {
+       struct btrfs_fs_info *fs_info;
         struct btrfs_root *root = NULL;
         struct btrfs_block_group_cache *cache = NULL;
         struct btrfs_trans_handle trans;
         struct btrfs_path *path = NULL;
         int ret;
  
-       root = btrfs_alloc_dummy_root(sectorsize, nodesize);
-       if (IS_ERR(root)) {
-               test_msg("Couldn't allocate dummy root\n");
-               ret = PTR_ERR(root);
+       fs_info = btrfs_alloc_dummy_fs_info();
+       if (!fs_info) {
+               test_msg("Couldn't allocate dummy fs info\n");
+               ret = -ENOMEM;
                 goto out;
         }
  
-       root->fs_info = btrfs_alloc_dummy_fs_info();
-       if (!root->fs_info) {
-               test_msg("Couldn't allocate dummy fs info\n");
-               ret = -ENOMEM;
+       root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+       if (IS_ERR(root)) {
+               test_msg("Couldn't allocate dummy root\n");
+               ret = PTR_ERR(root);
                 goto out;
         }
  
@@ -534,6 +535,7 @@ out:
         btrfs_free_path(path);
         btrfs_free_dummy_block_group(cache);
         btrfs_free_dummy_root(root);
+       btrfs_free_dummy_fs_info(fs_info);
         return ret;
  }
  
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c

index 29648c0a39f1bc380eaffb08f0d574098dc9d343..9f72aeda922041f9bec7de131be16817f38abc19 100644 (file)
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -230,6 +230,7 @@ static unsigned long vacancy_only = 0;
  
  static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
  {
+       struct btrfs_fs_info *fs_info = NULL;
         struct inode *inode = NULL;
         struct btrfs_root *root = NULL;
         struct extent_map *em = NULL;
@@ -248,19 +249,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
         BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
         BTRFS_I(inode)->location.offset = 0;
  
-       root = btrfs_alloc_dummy_root(sectorsize, nodesize);
-       if (IS_ERR(root)) {
-               test_msg("Couldn't allocate root\n");
+       fs_info = btrfs_alloc_dummy_fs_info();
+       if (!fs_info) {
+               test_msg("Couldn't allocate dummy fs info\n");
                 goto out;
         }
  
-       /*
-        * We do this since btrfs_get_extent wants to assign em->bdev to
-        * root->fs_info->fs_devices->latest_bdev.
-        */
-       root->fs_info = btrfs_alloc_dummy_fs_info();
-       if (!root->fs_info) {
-               test_msg("Couldn't allocate dummy fs info\n");
+       root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+       if (IS_ERR(root)) {
+               test_msg("Couldn't allocate root\n");
                 goto out;
         }
  
@@ -835,11 +832,13 @@ out:
                 free_extent_map(em);
         iput(inode);
         btrfs_free_dummy_root(root);
+       btrfs_free_dummy_fs_info(fs_info);
         return ret;
  }
  
  static int test_hole_first(u32 sectorsize, u32 nodesize)
  {
+       struct btrfs_fs_info *fs_info = NULL;
         struct inode *inode = NULL;
         struct btrfs_root *root = NULL;
         struct extent_map *em = NULL;
@@ -855,15 +854,15 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
         BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
         BTRFS_I(inode)->location.offset = 0;
  
-       root = btrfs_alloc_dummy_root(sectorsize, nodesize);
-       if (IS_ERR(root)) {
-               test_msg("Couldn't allocate root\n");
+       fs_info = btrfs_alloc_dummy_fs_info();
+       if (!fs_info) {
+               test_msg("Couldn't allocate dummy fs info\n");
                 goto out;
         }
  
-       root->fs_info = btrfs_alloc_dummy_fs_info();
-       if (!root->fs_info) {
-               test_msg("Couldn't allocate dummy fs info\n");
+       root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+       if (IS_ERR(root)) {
+               test_msg("Couldn't allocate root\n");
                 goto out;
         }
  
@@ -934,11 +933,13 @@ out:
                 free_extent_map(em);
         iput(inode);
         btrfs_free_dummy_root(root);
+       btrfs_free_dummy_fs_info(fs_info);
         return ret;
  }
  
  static int test_extent_accounting(u32 sectorsize, u32 nodesize)
  {
+       struct btrfs_fs_info *fs_info = NULL;
         struct inode *inode = NULL;
         struct btrfs_root *root = NULL;
         int ret = -ENOMEM;
@@ -949,15 +950,15 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
                 return ret;
         }
  
-       root = btrfs_alloc_dummy_root(sectorsize, nodesize);
-       if (IS_ERR(root)) {
-               test_msg("Couldn't allocate root\n");
+       fs_info = btrfs_alloc_dummy_fs_info();
+       if (!fs_info) {
+               test_msg("Couldn't allocate dummy fs info\n");
                 goto out;
         }
  
-       root->fs_info = btrfs_alloc_dummy_fs_info();
-       if (!root->fs_info) {
-               test_msg("Couldn't allocate dummy fs info\n");
+       root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+       if (IS_ERR(root)) {
+               test_msg("Couldn't allocate root\n");
                 goto out;
         }
  
@@ -1132,6 +1133,7 @@ out:
                                  NULL, GFP_KERNEL);
         iput(inode);
         btrfs_free_dummy_root(root);
+       btrfs_free_dummy_fs_info(fs_info);
         return ret;
  }
  
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c

index 57a12c0d680ba300461ff5ee9714833b541f2810..4407fef7c16c1911cef2c6eec107018b6546d513 100644 (file)
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -453,22 +453,24 @@ static int test_multiple_refs(struct btrfs_root *root,
  
  int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
  {
+       struct btrfs_fs_info *fs_info = NULL;
         struct btrfs_root *root;
         struct btrfs_root *tmp_root;
         int ret = 0;
  
-       root = btrfs_alloc_dummy_root(sectorsize, nodesize);
-       if (IS_ERR(root)) {
-               test_msg("Couldn't allocate root\n");
-               return PTR_ERR(root);
+       fs_info = btrfs_alloc_dummy_fs_info();
+       if (!fs_info) {
+               test_msg("Couldn't allocate dummy fs info\n");
+               return -ENOMEM;
         }
  
-       root->fs_info = btrfs_alloc_dummy_fs_info();
-       if (!root->fs_info) {
-               test_msg("Couldn't allocate dummy fs info\n");
-               ret = -ENOMEM;
+       root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+       if (IS_ERR(root)) {
+               test_msg("Couldn't allocate root\n");
+               ret = PTR_ERR(root);
                 goto out;
         }
+
         /* We are using this root as our extent root */
         root->fs_info->extent_root = root;
  
@@ -495,7 +497,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
         btrfs_set_header_nritems(root->node, 0);
         root->alloc_bytenr += 2 * nodesize;
  
-       tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize);
+       tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
         if (IS_ERR(tmp_root)) {
                 test_msg("Couldn't allocate a fs root\n");
                 ret = PTR_ERR(tmp_root);
@@ -510,7 +512,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
                 goto out;
         }
  
-       tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize);
+       tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
         if (IS_ERR(tmp_root)) {
                 test_msg("Couldn't allocate a fs root\n");
                 ret = PTR_ERR(tmp_root);
@@ -531,5 +533,6 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
         ret = test_multiple_refs(root, sectorsize, nodesize);
  out:
         btrfs_free_dummy_root(root);
+       btrfs_free_dummy_fs_info(fs_info);
         return ret;
  }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c

index 948aa186b353caf55748fd983e975c127b654f0f..9cca0a72196180986b440865221a8aafa5659315 100644 (file)
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -561,6 +561,7 @@ again:
         h->transaction = cur_trans;
         h->root = root;
         h->use_count = 1;
+       h->fs_info = root->fs_info;
  
         h->type = type;
         h->can_flush_pending_bgs = true;
@@ -1491,7 +1492,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                 goto dir_item_existed;
         } else if (IS_ERR(dir_item)) {
                 ret = PTR_ERR(dir_item);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
         btrfs_release_path(path);
@@ -1504,7 +1505,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
          */
         ret = btrfs_run_delayed_items(trans, root);
         if (ret) {      /* Transaction aborted */
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
  
@@ -1543,7 +1544,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         if (ret) {
                 btrfs_tree_unlock(old);
                 free_extent_buffer(old);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
  
@@ -1554,7 +1555,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         btrfs_tree_unlock(old);
         free_extent_buffer(old);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
         /* see comments in should_cow_block() */
@@ -1568,7 +1569,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         btrfs_tree_unlock(tmp);
         free_extent_buffer(tmp);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
  
@@ -1580,7 +1581,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                  btrfs_ino(parent_inode), index,
                                  dentry->d_name.name, dentry->d_name.len);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
  
@@ -1588,19 +1589,19 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
         if (IS_ERR(pending->snap)) {
                 ret = PTR_ERR(pending->snap);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
  
         ret = btrfs_reloc_post_snapshot(trans, pending);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
  
         ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
  
@@ -1622,7 +1623,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         /* We have check then name at the beginning, so it is impossible. */
         BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
  
@@ -1632,13 +1633,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                 current_fs_time(parent_inode->i_sb);
         ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
         ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b,
                                   BTRFS_UUID_KEY_SUBVOL, objectid);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
         if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
@@ -1647,14 +1648,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                           BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                           objectid);
                 if (ret && ret != -EEXIST) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto fail;
                 }
         }
  
         ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto fail;
         }
  
@@ -1709,7 +1710,7 @@ static void update_super_roots(struct btrfs_root *root)
         super->root = root_item->bytenr;
         super->generation = root_item->generation;
         super->root_level = root_item->level;
-       if (btrfs_test_opt(root, SPACE_CACHE))
+       if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
                 super->cache_generation = root_item->generation;
         if (root->fs_info->update_uuid_tree_gen)
                 super->uuid_tree_generation = root_item->generation;
@@ -1850,7 +1851,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
  
         WARN_ON(trans->use_count > 1);
  
-       btrfs_abort_transaction(trans, root, err);
+       btrfs_abort_transaction(trans, err);
  
         spin_lock(&root->fs_info->trans_lock);
  
@@ -1895,14 +1896,14 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
  
  static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
  {
-       if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+       if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
                 return btrfs_start_delalloc_roots(fs_info, 1, -1);
         return 0;
  }
  
  static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
  {
-       if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+       if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
                 btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
  }
  
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h

index c5abee4f01add54f811c78fac76170f8b5c02bb3..efb1226433800b12c83af38fad763eddbbab5cbb 100644 (file)
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -128,6 +128,7 @@ struct btrfs_trans_handle {
          * Subvolume quota depends on this
          */
         struct btrfs_root *root;
+       struct btrfs_fs_info *fs_info;
         struct seq_list delayed_ref_elem;
         struct list_head qgroup_ref_list;
         struct list_head new_bgs;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c

index c05f69a8ec42dad19269620e0ec7e8b38f91ca25..d31a0c4f56bed436e0eb933cceb592fdc498eb53 100644 (file)
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2757,7 +2757,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         while (1) {
                 int batch = atomic_read(&root->log_batch);
                 /* when we're on an ssd, just kick the log commit out */
-               if (!btrfs_test_opt(root, SSD) &&
+               if (!btrfs_test_opt(root->fs_info, SSD) &&
                     test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
                         mutex_unlock(&root->log_mutex);
                         schedule_timeout_uninterruptible(1);
@@ -2788,7 +2788,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
         if (ret) {
                 blk_finish_plug(&plug);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 btrfs_free_logged_extents(log, log_transid);
                 btrfs_set_log_full_commit(root->fs_info, trans);
                 mutex_unlock(&root->log_mutex);
@@ -2838,7 +2838,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                 btrfs_set_log_full_commit(root->fs_info, trans);
  
                 if (ret != -ENOSPC) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         mutex_unlock(&log_root_tree->log_mutex);
                         goto out;
                 }
@@ -2898,7 +2898,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         blk_finish_plug(&plug);
         if (ret) {
                 btrfs_set_log_full_commit(root->fs_info, trans);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 btrfs_free_logged_extents(log, log_transid);
                 mutex_unlock(&log_root_tree->log_mutex);
                 goto out_wake_log_root;
@@ -2934,7 +2934,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
         if (ret) {
                 btrfs_set_log_full_commit(root->fs_info, trans);
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out_wake_log_root;
         }
  
@@ -2991,7 +2991,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
         ret = walk_log_tree(trans, log, &wc);
         /* I don't think this can happen but just in case */
         if (ret)
-               btrfs_abort_transaction(trans, log, ret);
+               btrfs_abort_transaction(trans, ret);
  
         while (1) {
                 ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -3160,7 +3160,7 @@ out_unlock:
                 btrfs_set_log_full_commit(root->fs_info, trans);
                 ret = 0;
         } else if (ret < 0)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
  
         btrfs_end_log_trans(root);
  
@@ -3193,7 +3193,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
                 btrfs_set_log_full_commit(root->fs_info, trans);
                 ret = 0;
         } else if (ret < 0 && ret != -ENOENT)
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
         btrfs_end_log_trans(root);
  
         return ret;
@@ -4703,6 +4703,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                 ins_nr = 0;
                 ret = btrfs_search_forward(root, &min_key,
                                            path, trans->transid);
+               if (ret < 0) {
+                       err = ret;
+                       goto out_unlock;
+               }
                 if (ret != 0)
                         break;
  again:
@@ -5301,7 +5305,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
  
         sb = inode->i_sb;
  
-       if (btrfs_test_opt(root, NOTREELOG)) {
+       if (btrfs_test_opt(root->fs_info, NOTREELOG)) {
                 ret = 1;
                 goto end_no_trans;
         }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c

index 0fb4a959012e0db9cff59828d6734d0cdfc761f5..bb0addce755865dbb3fb3726998e2792c90205ed 100644 (file)
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -140,7 +140,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
  static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
  static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
  static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
-static void btrfs_close_one_device(struct btrfs_device *device);
  
  DEFINE_MUTEX(uuid_mutex);
  static LIST_HEAD(fs_uuids);
@@ -853,6 +852,46 @@ static void free_device(struct rcu_head *head)
         schedule_work(&device->rcu_work);
  }
  
+static void btrfs_close_one_device(struct btrfs_device *device)
+{
+       struct btrfs_fs_devices *fs_devices = device->fs_devices;
+       struct btrfs_device *new_device;
+       struct rcu_string *name;
+
+       if (device->bdev)
+               fs_devices->open_devices--;
+
+       if (device->writeable &&
+           device->devid != BTRFS_DEV_REPLACE_DEVID) {
+               list_del_init(&device->dev_alloc_list);
+               fs_devices->rw_devices--;
+       }
+
+       if (device->missing)
+               fs_devices->missing_devices--;
+
+       if (device->bdev && device->writeable) {
+               sync_blockdev(device->bdev);
+               invalidate_bdev(device->bdev);
+       }
+
+       new_device = btrfs_alloc_device(NULL, &device->devid,
+                                       device->uuid);
+       BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
+
+       /* Safe because we are under uuid_mutex */
+       if (device->name) {
+               name = rcu_string_strdup(device->name->str, GFP_NOFS);
+               BUG_ON(!name); /* -ENOMEM */
+               rcu_assign_pointer(new_device->name, name);
+       }
+
+       list_replace_rcu(&device->dev_list, &new_device->dev_list);
+       new_device->fs_devices = device->fs_devices;
+
+       call_rcu(&device->rcu, free_device);
+}
+
  static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
  {
         struct btrfs_device *device, *tmp;
@@ -2399,14 +2438,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
                 ret = init_first_rw_device(trans, root, device);
                 unlock_chunks(root);
                 if (ret) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto error_trans;
                 }
         }
  
         ret = btrfs_add_device(trans, root, device);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto error_trans;
         }
  
@@ -2415,7 +2454,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
  
                 ret = btrfs_finish_sprout(trans, root);
                 if (ret) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto error_trans;
                 }
  
@@ -2801,7 +2840,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
                                             &dev_extent_len);
                 if (ret) {
                         mutex_unlock(&fs_devices->device_list_mutex);
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
  
@@ -2820,7 +2859,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
                         ret = btrfs_update_device(trans, map->stripes[i].dev);
                         if (ret) {
                                 mutex_unlock(&fs_devices->device_list_mutex);
-                               btrfs_abort_transaction(trans, root, ret);
+                               btrfs_abort_transaction(trans, ret);
                                 goto out;
                         }
                 }
@@ -2829,7 +2868,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
  
         ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
         if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out;
         }
  
@@ -2838,14 +2877,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
                 if (ret) {
-                       btrfs_abort_transaction(trans, root, ret);
+                       btrfs_abort_transaction(trans, ret);
                         goto out;
                 }
         }
  
         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
         if (ret) {
-               btrfs_abort_transaction(trans, extent_root, ret);
+               btrfs_abort_transaction(trans, ret);
                 goto out;
         }
  
@@ -2902,7 +2941,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
          * chunk tree entries
          */
         ret = btrfs_remove_chunk(trans, root, chunk_offset);
-       btrfs_end_transaction(trans, root);
+       btrfs_end_transaction(trans, extent_root);
         return ret;
  }
  
@@ -3421,7 +3460,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
         u64 size_to_free;
         u64 chunk_type;
         struct btrfs_chunk *chunk;
-       struct btrfs_path *path;
+       struct btrfs_path *path = NULL;
         struct btrfs_key key;
         struct btrfs_key found_key;
         struct btrfs_trans_handle *trans;
@@ -3455,13 +3494,33 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
                 ret = btrfs_shrink_device(device, old_size - size_to_free);
                 if (ret == -ENOSPC)
                         break;
-               BUG_ON(ret);
+               if (ret) {
+                       /* btrfs_shrink_device never returns ret > 0 */
+                       WARN_ON(ret > 0);
+                       goto error;
+               }
  
                 trans = btrfs_start_transaction(dev_root, 0);
-               BUG_ON(IS_ERR(trans));
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       btrfs_info_in_rcu(fs_info,
+                "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
+                                         rcu_str_deref(device->name), ret,
+                                         old_size, old_size - size_to_free);
+                       goto error;
+               }
  
                 ret = btrfs_grow_device(trans, device, old_size);
-               BUG_ON(ret);
+               if (ret) {
+                       btrfs_end_transaction(trans, dev_root);
+                       /* btrfs_grow_device never returns ret > 0 */
+                       WARN_ON(ret > 0);
+                       btrfs_info_in_rcu(fs_info,
+                "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
+                                         rcu_str_deref(device->name), ret,
+                                         old_size, old_size - size_to_free);
+                       goto error;
+               }
  
                 btrfs_end_transaction(trans, dev_root);
         }
@@ -3885,7 +3944,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
         }
         spin_unlock(&fs_info->balance_lock);
  
-       if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+       if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
                 btrfs_info(fs_info, "force skipping balance");
                 return 0;
         }
@@ -4240,7 +4299,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
                                       BTRFS_UUID_TREE_OBJECTID);
         if (IS_ERR(uuid_root)) {
                 ret = PTR_ERR(uuid_root);
-               btrfs_abort_transaction(trans, tree_root, ret);
+               btrfs_abort_transaction(trans, ret);
                 btrfs_end_transaction(trans, tree_root);
                 return ret;
         }
@@ -4514,8 +4573,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
         btrfs_set_fs_incompat(info, RAID56);
  }
  
-#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r)            \
-                       - sizeof(struct btrfs_item)             \
+#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r)             \
                         - sizeof(struct btrfs_chunk))           \
                         / sizeof(struct btrfs_stripe) + 1)
  
@@ -6401,7 +6459,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                                    BTRFS_UUID_SIZE);
                 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
                                                         uuid, NULL);
-               if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
+               if (!map->stripes[i].dev &&
+                   !btrfs_test_opt(root->fs_info, DEGRADED)) {
                         free_extent_map(em);
                         return -EIO;
                 }
@@ -6469,7 +6528,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
  
         fs_devices = find_fsid(fsid);
         if (!fs_devices) {
-               if (!btrfs_test_opt(root, DEGRADED))
+               if (!btrfs_test_opt(root->fs_info, DEGRADED))
                         return ERR_PTR(-ENOENT);
  
                 fs_devices = alloc_fs_devices(fsid);
@@ -6531,7 +6590,7 @@ static int read_one_dev(struct btrfs_root *root,
  
         device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
         if (!device) {
-               if (!btrfs_test_opt(root, DEGRADED))
+               if (!btrfs_test_opt(root->fs_info, DEGRADED))
                         return -EIO;
  
                 device = add_missing_dev(root, fs_devices, devid, dev_uuid);
@@ -6540,7 +6599,7 @@ static int read_one_dev(struct btrfs_root *root,
                 btrfs_warn(root->fs_info, "devid %llu uuid %pU missing",
                                 devid, dev_uuid);
         } else {
-               if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
+               if (!device->bdev && !btrfs_test_opt(root->fs_info, DEGRADED))
                         return -EIO;
  
                 if(!device->bdev && !device->missing) {
@@ -7143,38 +7202,3 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
                 fs_devices = fs_devices->seed;
         }
  }
-
-static void btrfs_close_one_device(struct btrfs_device *device)
-{
-       struct btrfs_fs_devices *fs_devices = device->fs_devices;
-       struct btrfs_device *new_device;
-       struct rcu_string *name;
-
-       if (device->bdev)
-               fs_devices->open_devices--;
-
-       if (device->writeable &&
-           device->devid != BTRFS_DEV_REPLACE_DEVID) {
-               list_del_init(&device->dev_alloc_list);
-               fs_devices->rw_devices--;
-       }
-
-       if (device->missing)
-               fs_devices->missing_devices--;
-
-       new_device = btrfs_alloc_device(NULL, &device->devid,
-                                       device->uuid);
-       BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
-       /* Safe because we are under uuid_mutex */
-       if (device->name) {
-               name = rcu_string_strdup(device->name->str, GFP_NOFS);
-               BUG_ON(!name); /* -ENOMEM */
-               rcu_assign_pointer(new_device->name, name);
-       }
-
-       list_replace_rcu(&device->dev_list, &new_device->dev_list);
-       new_device->fs_devices = device->fs_devices;
-
-       call_rcu(&device->rcu, free_device);
-}
diff --git a/fs/exec.c b/fs/exec.c

index a1789cd684bf25dde64739c2c38b1bcc23cd3eb3..6fcfb3f7b137951b133d3db95431c23bc2d4677f 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -762,6 +762,39 @@ out_unlock:
  }
  EXPORT_SYMBOL(setup_arg_pages);
  
+#else
+
+/*
+ * Copy the program arguments and environment from the holding pages onto the
+ * stack. Returns 0 and updates *sp_location, or -EFAULT leaving it untouched.
+ */
+int transfer_args_to_stack(struct linux_binprm *bprm,
+                          unsigned long *sp_location)
+{
+       unsigned long index, stop, sp;
+       int ret = 0;
+
+       stop = bprm->p >> PAGE_SHIFT;
+       sp = *sp_location;
+
+       for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
+               unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
+               char *src = kmap(bprm->page[index]) + offset;
+               sp -= PAGE_SIZE - offset;
+               if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
+                       ret = -EFAULT;
+               kunmap(bprm->page[index]);
+               if (ret)
+                       goto out;
+       }
+
+       *sp_location = sp;
+
+out:
+       return ret;
+}
+EXPORT_SYMBOL(transfer_args_to_stack);
+
  #endif /* CONFIG_MMU */
  
  static struct file *do_open_execat(int fd, struct filename *name, int flags)
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c

index 5c57654927a6185cb1601bd5c52d5ba4c0475291..90e46cd752fe7568bc13198898bc450a63cc9e36 100644 (file)
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -959,10 +959,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
  
         if (S_ISLNK(root_inode->i_mode)) {
                 char *name = follow_link(host_root_path);
-               if (IS_ERR(name))
+               if (IS_ERR(name)) {
                         err = PTR_ERR(name);
-               else
-                       err = read_name(root_inode, name);
+                       goto out_put;
+               }
+               err = read_name(root_inode, name);
                 kfree(name);
                 if (err)
                         goto out_put;
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig

index c9f583d7bac858c9ac3afc51cf4dd02838b473d5..47febcf9918502a2b55b99a571c50b6b1c824cc3 100644 (file)
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -90,6 +90,7 @@ config NFSD_BLOCKLAYOUT
         bool "NFSv4.1 server support for pNFS block layouts"
         depends on NFSD_V4 && BLOCK
         select NFSD_PNFS
+       select EXPORTFS_BLOCK_OPS
         help
           This option enables support for the exporting pNFS block layouts
           in the kernel's NFS server. The pNFS block layout enables NFS
@@ -102,6 +103,7 @@ config NFSD_SCSILAYOUT
         bool "NFSv4.1 server support for pNFS SCSI layouts"
         depends on NFSD_V4 && BLOCK
         select NFSD_PNFS
+       select EXPORTFS_BLOCK_OPS
         help
           This option enables support for the exporting pNFS SCSI layouts
           in the kernel's NFS server. The pNFS SCSI layout enables NFS
@@ -111,6 +113,23 @@ config NFSD_SCSILAYOUT
  
           If unsure, say N.
  
+config NFSD_FLEXFILELAYOUT
+       bool "NFSv4.1 server support for pNFS Flex File layouts"
+       depends on NFSD_V4
+       select NFSD_PNFS
+       help
+         This option enables support for the exporting pNFS Flex File
+         layouts in the kernel's NFS server. The pNFS Flex File layout
+         enables NFS clients to directly perform I/O to NFSv3 devices
+         accessible to both the server and the clients.  See
+         draft-ietf-nfsv4-flex-files for more details.
+
+         Warning, this server implements the bare minimum functionality
+         to be a flex file server - it is for testing the client,
+         not for use in production.
+
+         If unsure, say N.
+
  config NFSD_V4_SECURITY_LABEL
         bool "Provide Security Label support for NFSv4 server"
         depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile

index 3ae5f3c77e28b15b532e25668a86403446dc4425..5f5d3a76980c02725018555e7c8f239dfb59f228 100644 (file)
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -20,3 +20,4 @@ nfsd-$(CONFIG_NFSD_V4)        += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
  nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
  nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
  nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
+nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c

index ad2c05e80a836fcc49002802357f049f61852732..5a17084415103a08b18cfbcb31bc80e9b422de53 100644 (file)
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -163,6 +163,7 @@ nfsd4_block_get_device_info_simple(struct super_block *sb,
  
  static __be32
  nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
+               struct svc_rqst *rqstp,
                 struct nfs4_client *clp,
                 struct nfsd4_getdeviceinfo *gdp)
  {
@@ -355,6 +356,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
  
  static __be32
  nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
+               struct svc_rqst *rqstp,
                 struct nfs4_client *clp,
                 struct nfsd4_getdeviceinfo *gdp)
  {
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c

index 4ebaaf4b8d8a425d98418f09f9b445ff9615da32..ac6f54546fdde92b060e10839f631a574fa76cf5 100644 (file)
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -44,7 +44,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
  
         switch (b->type) {
         case PNFS_BLOCK_VOLUME_SIMPLE:
-               len = 4 + 4 + 8 + 4 + b->simple.sig_len;
+               len = 4 + 4 + 8 + 4 + (XDR_QUADLEN(b->simple.sig_len) << 2);
                 p = xdr_reserve_space(xdr, len);
                 if (!p)
                         return -ETOOSMALL;
@@ -55,7 +55,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
                 p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
                 break;
         case PNFS_BLOCK_VOLUME_SCSI:
-               len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8;
+               len = 4 + 4 + 4 + 4 + (XDR_QUADLEN(b->scsi.designator_len) << 2) + 8;
                 p = xdr_reserve_space(xdr, len);
                 if (!p)
                         return -ETOOSMALL;
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c

index b4d84b579f20cd5da76866586dfa283d64c6669a..43e109cc0ccc39e8293a7c8926bcb1c105951714 100644 (file)
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -706,7 +706,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
         new->ex_fslocs.locations = NULL;
         new->ex_fslocs.locations_count = 0;
         new->ex_fslocs.migrated = 0;
-       new->ex_layout_type = 0;
+       new->ex_layout_types = 0;
         new->ex_uuid = NULL;
         new->cd = item->cd;
  }
@@ -731,7 +731,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
         item->ex_fslocs.locations_count = 0;
         new->ex_fslocs.migrated = item->ex_fslocs.migrated;
         item->ex_fslocs.migrated = 0;
-       new->ex_layout_type = item->ex_layout_type;
+       new->ex_layout_types = item->ex_layout_types;
         new->ex_nflavors = item->ex_nflavors;
         for (i = 0; i < MAX_SECINFO_LIST; i++) {
                 new->ex_flavors[i] = item->ex_flavors[i];
@@ -954,6 +954,16 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
                     rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)
                         return 0;
         }
+
+       /* If the compound op contains a spo_must_allowed op,
+        * it will be sent with integrity/protection which
+        * will have to be expressly allowed on mounts that
+        * don't support it
+        */
+
+       if (nfsd4_spo_must_allow(rqstp))
+               return 0;
+
         return nfserr_wrongsec;
  }
  
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h

index 2e315072bf3fb83e62d0c469a2d43b2d3683392f..730f15eeb7ed561fb50b5485af00f2a5a8aca2f7 100644 (file)
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -57,7 +57,7 @@ struct svc_export {
         struct nfsd4_fs_locations ex_fslocs;
         uint32_t                ex_nflavors;
         struct exp_flavor_info  ex_flavors[MAX_SECINFO_LIST];
-       enum pnfs_layouttype    ex_layout_type;
+       u32                     ex_layout_types;
         struct nfsd4_deviceid_map *ex_devid_map;
         struct cache_detail     *cd;
  };
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c

new file mode 100644 (file)

index 0000000..df880e9
--- /dev/null
+++ b/fs/nfsd/flexfilelayout.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
+ *
+ * The following implements a super-simple flex-file server
+ * where the NFSv4.1 mds is also the ds. And the storage is
+ * the same. I.e., writing to the mds via a NFSv4.1 WRITE
+ * goes to the same location as the NFSv3 WRITE.
+ */
+#include <linux/slab.h>
+
+#include <linux/nfsd/debug.h>
+
+#include <linux/sunrpc/addr.h>
+
+#include "flexfilelayoutxdr.h"
+#include "pnfs.h"
+
+#define NFSDDBG_FACILITY       NFSDDBG_PNFS
+
+static __be32
+nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
+               struct nfsd4_layoutget *args)
+{
+       struct nfsd4_layout_seg *seg = &args->lg_seg;
+       u32 device_generation = 0;
+       int error;
+       uid_t u;
+
+       struct pnfs_ff_layout *fl;
+
+       /*
+        * The super simple flex file server has 1 mirror, 1 data server,
+        * and 1 file handle. So instead of 4 allocs, do 1 for now.
+        * Zero it out for the stateid - don't want junk in there!
+        */
+       error = -ENOMEM;
+       fl = kzalloc(sizeof(*fl), GFP_KERNEL);
+       if (!fl)
+               goto out_error;
+       args->lg_content = fl;
+
+       /*
+        * Avoid layout commit, try to force the I/O to the DS,
+        * and for fun, cause all IOMODE_RW layout segments to
+        * effectively be WRITE only.
+        */
+       fl->flags = FF_FLAGS_NO_LAYOUTCOMMIT | FF_FLAGS_NO_IO_THRU_MDS |
+                   FF_FLAGS_NO_READ_IO;
+
+       /* Do not allow an IOMODE_READ segment to have write permissions */
+       if (seg->iomode == IOMODE_READ) {
+               u = from_kuid(&init_user_ns, inode->i_uid) + 1;
+               fl->uid = make_kuid(&init_user_ns, u);
+       } else
+               fl->uid = inode->i_uid;
+       fl->gid = inode->i_gid;
+
+       error = nfsd4_set_deviceid(&fl->deviceid, fhp, device_generation);
+       if (error)
+               goto out_error;
+
+       fl->fh.size = fhp->fh_handle.fh_size;
+       memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size);
+
+       /* Give whole file layout segments */
+       seg->offset = 0;
+       seg->length = NFS4_MAX_UINT64;
+
+       dprintk("GET: 0x%llx:0x%llx %d\n", seg->offset, seg->length,
+               seg->iomode);
+       return 0;
+
+out_error:
+       seg->length = 0;
+       return nfserrno(error);
+}
+
+static __be32
+nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp,
+               struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp)
+{
+       struct pnfs_ff_device_addr *da;
+
+       u16 port;
+       char addr[INET6_ADDRSTRLEN];
+
+       da = kzalloc(sizeof(struct pnfs_ff_device_addr), GFP_KERNEL);
+       if (!da)
+               return nfserrno(-ENOMEM);
+
+       gdp->gd_device = da;
+
+       da->version = 3;
+       da->minor_version = 0;
+
+       da->rsize = svc_max_payload(rqstp);
+       da->wsize = da->rsize;
+
+       rpc_ntop((struct sockaddr *)&rqstp->rq_daddr,
+                addr, INET6_ADDRSTRLEN);
+       if (rqstp->rq_daddr.ss_family == AF_INET) {
+               struct sockaddr_in *sin;
+
+               sin = (struct sockaddr_in *)&rqstp->rq_daddr;
+               port = ntohs(sin->sin_port);
+               snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp");
+               da->netaddr.netid_len = 3;
+       } else {
+               struct sockaddr_in6 *sin6;
+
+               sin6 = (struct sockaddr_in6 *)&rqstp->rq_daddr;
+               port = ntohs(sin6->sin6_port);
+               snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp6");
+               da->netaddr.netid_len = 4;
+       }
+
+       da->netaddr.addr_len =
+               snprintf(da->netaddr.addr, FF_ADDR_LEN + 1,
+                        "%s.%hhu.%hhu", addr, port >> 8, port & 0xff);
+
+       da->tightly_coupled = false;
+
+       return 0;
+}
+
+const struct nfsd4_layout_ops ff_layout_ops = {
+       .notify_types           =
+                       NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+       .proc_getdeviceinfo     = nfsd4_ff_proc_getdeviceinfo,
+       .encode_getdeviceinfo   = nfsd4_ff_encode_getdeviceinfo,
+       .proc_layoutget         = nfsd4_ff_proc_layoutget,
+       .encode_layoutget       = nfsd4_ff_encode_layoutget,
+};
diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c

new file mode 100644 (file)

index 0000000..5e3fd7f
--- /dev/null
+++ b/fs/nfsd/flexfilelayoutxdr.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/nfs4.h>
+
+#include "nfsd.h"
+#include "flexfilelayoutxdr.h"
+
+#define NFSDDBG_FACILITY       NFSDDBG_PNFS
+
+struct ff_idmap {
+       char buf[11];
+       int len;
+};
+
+__be32
+nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
+               struct nfsd4_layoutget *lgp)
+{
+       struct pnfs_ff_layout *fl = lgp->lg_content;
+       int len, mirror_len, ds_len, fh_len;
+       __be32 *p;
+
+       /*
+        * Unlike nfsd4_encode_user, we know these will
+        * always be stringified.
+        */
+       struct ff_idmap uid;
+       struct ff_idmap gid;
+
+       fh_len = 4 + fl->fh.size;
+
+       uid.len = sprintf(uid.buf, "%u", from_kuid(&init_user_ns, fl->uid));
+       gid.len = sprintf(gid.buf, "%u", from_kgid(&init_user_ns, fl->gid));
+
+       /* 8 + len for recording the length, name, and padding */
+       ds_len = 20 + sizeof(stateid_opaque_t) + 4 + fh_len +
+                8 + uid.len + 8 + gid.len;
+
+       mirror_len = 4 + ds_len;
+
+       /* The layout segment */
+       len = 20 + mirror_len;
+
+       p = xdr_reserve_space(xdr, sizeof(__be32) + len);
+       if (!p)
+               return nfserr_toosmall;
+
+       *p++ = cpu_to_be32(len);
+       p = xdr_encode_hyper(p, 0);             /* stripe unit of 1 */
+
+       *p++ = cpu_to_be32(1);                  /* single mirror */
+       *p++ = cpu_to_be32(1);                  /* single data server */
+
+       p = xdr_encode_opaque_fixed(p, &fl->deviceid,
+                       sizeof(struct nfsd4_deviceid));
+
+       *p++ = cpu_to_be32(1);                  /* efficiency */
+
+       *p++ = cpu_to_be32(fl->stateid.si_generation);
+       p = xdr_encode_opaque_fixed(p, &fl->stateid.si_opaque,
+                                   sizeof(stateid_opaque_t));
+
+       *p++ = cpu_to_be32(1);                  /* single file handle */
+       p = xdr_encode_opaque(p, fl->fh.data, fl->fh.size);
+
+       p = xdr_encode_opaque(p, uid.buf, uid.len);
+       p = xdr_encode_opaque(p, gid.buf, gid.len);
+
+       *p++ = cpu_to_be32(fl->flags);
+       *p++ = cpu_to_be32(0);                  /* No stats collect hint */
+
+       return 0;
+}
+
+__be32
+nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
+               struct nfsd4_getdeviceinfo *gdp)
+{
+       struct pnfs_ff_device_addr *da = gdp->gd_device;
+       int len;
+       int ver_len;
+       int addr_len;
+       __be32 *p;
+
+       /* len + padding for two strings */
+       addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len;
+       ver_len = 20;
+
+       len = 4 + ver_len + 4 + addr_len;
+
+       p = xdr_reserve_space(xdr, len + sizeof(__be32));
+       if (!p)
+               return nfserr_resource;
+
+       /*
+        * Fill in the overall length and number of netaddrs at the beginning
+        * of the device address.
+        */
+       *p++ = cpu_to_be32(len);
+       *p++ = cpu_to_be32(1);                  /* 1 netaddr */
+       p = xdr_encode_opaque(p, da->netaddr.netid, da->netaddr.netid_len);
+       p = xdr_encode_opaque(p, da->netaddr.addr, da->netaddr.addr_len);
+
+       *p++ = cpu_to_be32(1);                  /* 1 versions */
+
+       *p++ = cpu_to_be32(da->version);
+       *p++ = cpu_to_be32(da->minor_version);
+       *p++ = cpu_to_be32(da->rsize);
+       *p++ = cpu_to_be32(da->wsize);
+       *p++ = cpu_to_be32(da->tightly_coupled);
+
+       return 0;
+}
diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h

new file mode 100644 (file)

index 0000000..467defd
--- /dev/null
+++ b/fs/nfsd/flexfilelayoutxdr.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
+ */
+#ifndef _NFSD_FLEXFILELAYOUTXDR_H
+#define _NFSD_FLEXFILELAYOUTXDR_H 1
+
+#include <linux/inet.h>
+#include "xdr4.h"
+
+#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+#define FF_FLAGS_NO_IO_THRU_MDS  2
+#define FF_FLAGS_NO_READ_IO      4
+
+struct xdr_stream;
+
+#define FF_NETID_LEN           (4)
+#define FF_ADDR_LEN            (INET6_ADDRSTRLEN + 8)
+struct pnfs_ff_netaddr {
+       char                            netid[FF_NETID_LEN + 1];
+       char                            addr[FF_ADDR_LEN + 1];
+       u32                             netid_len;
+       u32                             addr_len;
+};
+
+struct pnfs_ff_device_addr {
+       struct pnfs_ff_netaddr          netaddr;
+       u32                             version;
+       u32                             minor_version;
+       u32                             rsize;
+       u32                             wsize;
+       bool                            tightly_coupled;
+};
+
+struct pnfs_ff_layout {
+       u32                             flags;
+       u32                             stats_collect_hint;
+       kuid_t                          uid;
+       kgid_t                          gid;
+       struct nfsd4_deviceid           deviceid;
+       stateid_t                       stateid;
+       struct nfs_fh                   fh;
+};
+
+__be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
+               struct nfsd4_getdeviceinfo *gdp);
+__be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
+               struct nfsd4_layoutget *lgp);
+
+#endif /* _NFSD_FLEXFILELAYOUTXDR_H */
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c

index 953c0755cb37e23697a2308800ccaf7bf85232cf..2be9602b0221bd19f7492dad6266f3fd78c4e050 100644 (file)
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -27,6 +27,9 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
  static const struct lock_manager_operations nfsd4_layouts_lm_ops;
  
  const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
+#ifdef CONFIG_NFSD_FLEXFILELAYOUT
+       [LAYOUT_FLEX_FILES]     = &ff_layout_ops,
+#endif
  #ifdef CONFIG_NFSD_BLOCKLAYOUT
         [LAYOUT_BLOCK_VOLUME]   = &bl_layout_ops,
  #endif
@@ -122,28 +125,35 @@ nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
  
  void nfsd4_setup_layout_type(struct svc_export *exp)
  {
+#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
         struct super_block *sb = exp->ex_path.mnt->mnt_sb;
+#endif
  
         if (!(exp->ex_flags & NFSEXP_PNFS))
                 return;
  
         /*
-        * Check if the file system supports exporting a block-like layout.
+        * If flex file is configured, use it by default. Otherwise
+        * check if the file system supports exporting a block-like layout.
          * If the block device supports reservations prefer the SCSI layout,
          * otherwise advertise the block layout.
          */
+#ifdef CONFIG_NFSD_FLEXFILELAYOUT
+       exp->ex_layout_types |= 1 << LAYOUT_FLEX_FILES;
+#endif
  #ifdef CONFIG_NFSD_BLOCKLAYOUT
+       /* also advertise the block layout if the filesystem supports it */
         if (sb->s_export_op->get_uuid &&
             sb->s_export_op->map_blocks &&
             sb->s_export_op->commit_blocks)
-               exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
+               exp->ex_layout_types |= 1 << LAYOUT_BLOCK_VOLUME;
  #endif
  #ifdef CONFIG_NFSD_SCSILAYOUT
         /* overwrite block layout selection if needed */
         if (sb->s_export_op->map_blocks &&
             sb->s_export_op->commit_blocks &&
             sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops)
-               exp->ex_layout_type = LAYOUT_SCSI;
+               exp->ex_layout_types |= 1 << LAYOUT_SCSI;
  #endif
  }
  
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c

index de1ff1d98bb188a5661893f25e67926b70f7182f..1fb222752b2b154d1c7171b6c1ea766a8472d09e 100644 (file)
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -605,8 +605,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
  
         fh_init(&resfh, NFS4_FHSIZE);
  
-       status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR,
-                          NFSD_MAY_CREATE);
+       status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_NOP);
         if (status)
                 return status;
  
@@ -1219,12 +1218,12 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
  static const struct nfsd4_layout_ops *
  nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
  {
-       if (!exp->ex_layout_type) {
+       if (!exp->ex_layout_types) {
                 dprintk("%s: export does not support pNFS\n", __func__);
                 return NULL;
         }
  
-       if (exp->ex_layout_type != layout_type) {
+       if (!(exp->ex_layout_types & (1 << layout_type))) {
                 dprintk("%s: layout type %d not supported\n",
                         __func__, layout_type);
                 return NULL;
@@ -1270,7 +1269,7 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
         nfserr = nfs_ok;
         if (gdp->gd_maxcount != 0) {
                 nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
-                                       cstate->session->se_client, gdp);
+                               rqstp, cstate->session->se_client, gdp);
         }
  
         gdp->gd_notify_types &= ops->notify_types;
@@ -2335,6 +2334,45 @@ static struct nfsd4_operation nfsd4_ops[] = {
         },
  };
  
+/**
+ * nfsd4_spo_must_allow - Determine if the compound op contains an
+ * operation that is allowed to be sent with machine credentials
+ *
+ * @rqstp: a pointer to the struct svc_rqst
+ *
+ * Checks to see if the compound contains a spo_must_allow op
+ * and confirms that it was sent with the proper machine creds.
+ */
+
+bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
+{
+       struct nfsd4_compoundres *resp = rqstp->rq_resp;
+       struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+       struct nfsd4_op *this = &argp->ops[resp->opcnt - 1];
+       struct nfsd4_compound_state *cstate = &resp->cstate;
+       struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow;
+       u32 opiter;
+
+       if (!cstate->minorversion)
+               return false;
+
+       if (cstate->spo_must_allowed == true)
+               return true;
+
+       opiter = resp->opcnt;
+       while (opiter < argp->opcnt) {
+               this = &argp->ops[opiter++];
+               if (test_bit(this->opnum, allow->u.longs) &&
+                       cstate->clp->cl_mach_cred &&
+                       nfsd4_mach_creds_match(cstate->clp, rqstp)) {
+                       cstate->spo_must_allowed = true;
+                       return true;
+               }
+       }
+       cstate->spo_must_allowed = false;
+       return false;
+}
+
  int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
  {
         struct nfsd4_operation *opdesc;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c

index 70d0b9b33031ee88985da10ff2d11c782a2b49f6..8410ca275db1aecf0a1f8a022b92cdc1597ff258 100644 (file)
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1200,27 +1200,6 @@ free_ol_stateid_reaplist(struct list_head *reaplist)
         }
  }
  
-static void release_lockowner(struct nfs4_lockowner *lo)
-{
-       struct nfs4_client *clp = lo->lo_owner.so_client;
-       struct nfs4_ol_stateid *stp;
-       struct list_head reaplist;
-
-       INIT_LIST_HEAD(&reaplist);
-
-       spin_lock(&clp->cl_lock);
-       unhash_lockowner_locked(lo);
-       while (!list_empty(&lo->lo_owner.so_stateids)) {
-               stp = list_first_entry(&lo->lo_owner.so_stateids,
-                               struct nfs4_ol_stateid, st_perstateowner);
-               WARN_ON(!unhash_lock_stateid(stp));
-               put_ol_stateid_locked(stp, &reaplist);
-       }
-       spin_unlock(&clp->cl_lock);
-       free_ol_stateid_reaplist(&reaplist);
-       nfs4_put_stateowner(&lo->lo_owner);
-}
-
  static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
                                        struct list_head *reaplist)
  {
@@ -1972,7 +1951,7 @@ static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp)
                service == RPC_GSS_SVC_PRIVACY;
  }
  
-static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
+bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
  {
         struct svc_cred *cr = &rqstp->rq_cred;
  
@@ -2388,6 +2367,22 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
  
         switch (exid->spa_how) {
         case SP4_MACH_CRED:
+               exid->spo_must_enforce[0] = 0;
+               exid->spo_must_enforce[1] = (
+                       1 << (OP_BIND_CONN_TO_SESSION - 32) |
+                       1 << (OP_EXCHANGE_ID - 32) |
+                       1 << (OP_CREATE_SESSION - 32) |
+                       1 << (OP_DESTROY_SESSION - 32) |
+                       1 << (OP_DESTROY_CLIENTID - 32));
+
+               exid->spo_must_allow[0] &= (1 << (OP_CLOSE) |
+                                       1 << (OP_OPEN_DOWNGRADE) |
+                                       1 << (OP_LOCKU) |
+                                       1 << (OP_DELEGRETURN));
+
+               exid->spo_must_allow[1] &= (
+                                       1 << (OP_TEST_STATEID - 32) |
+                                       1 << (OP_FREE_STATEID - 32));
                 if (!svc_rqst_integrity_protected(rqstp)) {
                         status = nfserr_inval;
                         goto out_nolock;
@@ -2424,7 +2419,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
                                 status = nfserr_inval;
                                 goto out;
                         }
-                       if (!mach_creds_match(conf, rqstp)) {
+                       if (!nfsd4_mach_creds_match(conf, rqstp)) {
                                 status = nfserr_wrong_cred;
                                 goto out;
                         }
@@ -2473,6 +2468,8 @@ out_new:
                         goto out;
         }
         new->cl_minorversion = cstate->minorversion;
+       new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0];
+       new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1];
  
         gen_clid(new, nn);
         add_to_unconfirmed(new);
@@ -2676,7 +2673,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
  
         if (conf) {
                 status = nfserr_wrong_cred;
-               if (!mach_creds_match(conf, rqstp))
+               if (!nfsd4_mach_creds_match(conf, rqstp))
                         goto out_free_conn;
                 cs_slot = &conf->cl_cs_slot;
                 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
@@ -2692,7 +2689,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                         goto out_free_conn;
                 }
                 status = nfserr_wrong_cred;
-               if (!mach_creds_match(unconf, rqstp))
+               if (!nfsd4_mach_creds_match(unconf, rqstp))
                         goto out_free_conn;
                 cs_slot = &unconf->cl_cs_slot;
                 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
@@ -2801,7 +2798,7 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
         if (!session)
                 goto out_no_session;
         status = nfserr_wrong_cred;
-       if (!mach_creds_match(session->se_client, rqstp))
+       if (!nfsd4_mach_creds_match(session->se_client, rqstp))
                 goto out;
         status = nfsd4_map_bcts_dir(&bcts->dir);
         if (status)
@@ -2848,7 +2845,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
         if (!ses)
                 goto out_client_lock;
         status = nfserr_wrong_cred;
-       if (!mach_creds_match(ses->se_client, r))
+       if (!nfsd4_mach_creds_match(ses->se_client, r))
                 goto out_put_session;
         status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
         if (status)
@@ -3087,7 +3084,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
                 status = nfserr_stale_clientid;
                 goto out;
         }
-       if (!mach_creds_match(clp, rqstp)) {
+       if (!nfsd4_mach_creds_match(clp, rqstp)) {
                 clp = NULL;
                 status = nfserr_wrong_cred;
                 goto out;
@@ -3112,7 +3109,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
                  * We don't take advantage of the rca_one_fs case.
                  * That's OK, it's optional, we can safely ignore it.
                  */
-                return nfs_ok;
+               return nfs_ok;
         }
  
         status = nfserr_complete_already;
@@ -5945,6 +5942,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
         __be32 status;
         struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
         struct nfs4_client *clp;
+       LIST_HEAD (reaplist);
  
         dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
                 clid->cl_boot, clid->cl_id);
@@ -5975,9 +5973,23 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
                 nfs4_get_stateowner(sop);
                 break;
         }
+       if (!lo) {
+               spin_unlock(&clp->cl_lock);
+               return status;
+       }
+
+       unhash_lockowner_locked(lo);
+       while (!list_empty(&lo->lo_owner.so_stateids)) {
+               stp = list_first_entry(&lo->lo_owner.so_stateids,
+                                      struct nfs4_ol_stateid,
+                                      st_perstateowner);
+               WARN_ON(!unhash_lock_stateid(stp));
+               put_ol_stateid_locked(stp, &reaplist);
+       }
         spin_unlock(&clp->cl_lock);
-       if (lo)
-               release_lockowner(lo);
+       free_ol_stateid_reaplist(&reaplist);
+       nfs4_put_stateowner(&lo->lo_owner);
+
         return status;
  }
  
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c

index 9df898ba648f73a14b1e42be47dbfddf189ec0a6..0aa0236a142904c6de5123b87ef96c55043c7f7d 100644 (file)
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1299,16 +1299,14 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
                 break;
         case SP4_MACH_CRED:
                 /* spo_must_enforce */
-               READ_BUF(4);
-               dummy = be32_to_cpup(p++);
-               READ_BUF(dummy * 4);
-               p += dummy;
-
+               status = nfsd4_decode_bitmap(argp,
+                                       exid->spo_must_enforce);
+               if (status)
+                       goto out;
                 /* spo_must_allow */
-               READ_BUF(4);
-               dummy = be32_to_cpup(p++);
-               READ_BUF(dummy * 4);
-               p += dummy;
+               status = nfsd4_decode_bitmap(argp, exid->spo_must_allow);
+               if (status)
+                       goto out;
                 break;
         case SP4_SSV:
                 /* ssp_ops */
@@ -2164,22 +2162,20 @@ nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
  }
  
  static inline __be32
-nfsd4_encode_layout_type(struct xdr_stream *xdr, enum pnfs_layouttype layout_type)
+nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types)
  {
-       __be32 *p;
+       __be32          *p;
+       unsigned long   i = hweight_long(layout_types);
  
-       if (layout_type) {
-               p = xdr_reserve_space(xdr, 8);
-               if (!p)
-                       return nfserr_resource;
-               *p++ = cpu_to_be32(1);
-               *p++ = cpu_to_be32(layout_type);
-       } else {
-               p = xdr_reserve_space(xdr, 4);
-               if (!p)
-                       return nfserr_resource;
-               *p++ = cpu_to_be32(0);
-       }
+       p = xdr_reserve_space(xdr, 4 + 4 * i);
+       if (!p)
+               return nfserr_resource;
+
+       *p++ = cpu_to_be32(i);
+
+       for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
+               if (layout_types & (1 << i))
+                       *p++ = cpu_to_be32(i);
  
         return 0;
  }
@@ -2754,13 +2750,13 @@ out_acl:
         }
  #ifdef CONFIG_NFSD_PNFS
         if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
-               status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type);
+               status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
                 if (status)
                         goto out;
         }
  
         if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) {
-               status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type);
+               status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
                 if (status)
                         goto out;
         }
@@ -3867,14 +3863,6 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
         return nfserr;
  }
  
-static const u32 nfs4_minimal_spo_must_enforce[2] = {
-       [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
-             1 << (OP_EXCHANGE_ID - 32) |
-             1 << (OP_CREATE_SESSION - 32) |
-             1 << (OP_DESTROY_SESSION - 32) |
-             1 << (OP_DESTROY_CLIENTID - 32)
-};
-
  static __be32
  nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
                          struct nfsd4_exchange_id *exid)
@@ -3885,6 +3873,7 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
         char *server_scope;
         int major_id_sz;
         int server_scope_sz;
+       int status = 0;
         uint64_t minor_id = 0;
  
         if (nfserr)
@@ -3913,18 +3902,20 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
         case SP4_NONE:
                 break;
         case SP4_MACH_CRED:
-               /* spo_must_enforce, spo_must_allow */
-               p = xdr_reserve_space(xdr, 16);
-               if (!p)
-                       return nfserr_resource;
-
                 /* spo_must_enforce bitmap: */
-               *p++ = cpu_to_be32(2);
-               *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[0]);
-               *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[1]);
-               /* empty spo_must_allow bitmap: */
-               *p++ = cpu_to_be32(0);
-
+               status = nfsd4_encode_bitmap(xdr,
+                                       exid->spo_must_enforce[0],
+                                       exid->spo_must_enforce[1],
+                                       exid->spo_must_enforce[2]);
+               if (status)
+                       goto out;
+               /* spo_must_allow bitmap: */
+               status = nfsd4_encode_bitmap(xdr,
+                                       exid->spo_must_allow[0],
+                                       exid->spo_must_allow[1],
+                                       exid->spo_must_allow[2]);
+               if (status)
+                       goto out;
                 break;
         default:
                 WARN_ON_ONCE(1);
@@ -3951,6 +3942,8 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
         /* Implementation id */
         *p++ = cpu_to_be32(0);  /* zero length nfs_impl_id4 array */
         return 0;
+out:
+       return status;
  }
  
  static __be32
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h

index cf980523898b78cc98debc868e2b5a249531ab18..9446849888d52e470d763e75d5dbb5e1f3e60f41 100644 (file)
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -124,6 +124,7 @@ void nfs4_state_shutdown_net(struct net *net);
  void nfs4_reset_lease(time_t leasetime);
  int nfs4_reset_recoverydir(char *recdir);
  char * nfs4_recoverydir(void);
+bool nfsd4_spo_must_allow(struct svc_rqst *rqstp);
  #else
  static inline int nfsd4_init_slabs(void) { return 0; }
  static inline void nfsd4_free_slabs(void) { }
@@ -134,6 +135,10 @@ static inline void nfs4_state_shutdown_net(struct net *net) { }
  static inline void nfs4_reset_lease(time_t leasetime) { }
  static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
  static inline char * nfs4_recoverydir(void) {return NULL; }
+static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
+{
+       return false;
+}
  #endif
  
  /*
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c

index a8919444c46085545ef99bd9b4b72f984745557e..cfe7500d5847baaca753a064d132ed6dd4d58c3b 100644 (file)
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -59,14 +59,20 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
   * the write call).
   */
  static inline __be32
-nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, umode_t requested)
+nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry,
+               umode_t requested)
  {
-       mode &= S_IFMT;
+       umode_t mode = d_inode(dentry)->i_mode & S_IFMT;
  
         if (requested == 0) /* the caller doesn't care */
                 return nfs_ok;
-       if (mode == requested)
+       if (mode == requested) {
+               if (mode == S_IFDIR && !d_can_lookup(dentry)) {
+                       WARN_ON_ONCE(1);
+                       return nfserr_notdir;
+               }
                 return nfs_ok;
+       }
         /*
          * v4 has an error more specific than err_notdir which we should
          * return in preference to err_notdir:
@@ -298,7 +304,7 @@ out:
   * that it expects something not of the given type.
   *
   * @access is formed from the NFSD_MAY_* constants defined in
- * include/linux/nfsd/nfsd.h.
+ * fs/nfsd/vfs.h.
   */
  __be32
  fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
@@ -340,7 +346,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
         if (error)
                 goto out;
  
-       error = nfsd_mode_check(rqstp, d_inode(dentry)->i_mode, type);
+       error = nfsd_mode_check(rqstp, dentry, type);
         if (error)
                 goto out;
  
@@ -533,7 +539,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
          * the reference filehandle (if it is in the same export)
          * or the export options.
          */
-        set_version_and_fsid_type(fhp, exp, ref_fh);
+       set_version_and_fsid_type(fhp, exp, ref_fh);
  
         if (ref_fh == fhp)
                 fh_put(ref_fh);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c

index 4cd78ef4c95c4def33831856e863a36fc850ff59..e9214768cde90f84138f6279e115a7a8c7350b95 100644 (file)
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -251,9 +251,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
  
         /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */
  
-       nfserr = nfserr_acces;
-       if (!argp->len)
-               goto done;
         nfserr = nfserr_exist;
         if (isdotent(argp->name, argp->len))
                 goto done;
@@ -362,8 +359,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
         nfserr = 0;
         if (!inode) {
                 /* File doesn't exist. Create it and set attrs */
-               nfserr = nfsd_create(rqstp, dirfhp, argp->name, argp->len,
-                                       attr, type, rdev, newfhp);
+               nfserr = nfsd_create_locked(rqstp, dirfhp, argp->name,
+                                       argp->len, attr, type, rdev, newfhp);
         } else if (type == S_IFREG) {
                 dprintk("nfsd:   existing %s, valid=%x, size=%ld\n",
                         argp->name, attr->ia_valid, (long) attr->ia_size);
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c

index 79d964aa8079f354fcadf50ac2a1d3d216668d4a..41b468a6a90f807fe3f3d2e4ceeaa9f8c7ae0f8c 100644 (file)
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -240,7 +240,7 @@ nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p,
          || !(p = decode_filename(p, &args->name, &args->len)))
                 return 0;
  
-        return xdr_argsize_check(rqstp, p);
+       return xdr_argsize_check(rqstp, p);
  }
  
  int
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h

index 7d073b9b1553041d32910ef41de7883a90af3b37..0c2a716e87411d6b08258632e966666f34190b82 100644 (file)
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -21,6 +21,7 @@ struct nfsd4_layout_ops {
         u32             notify_types;
  
         __be32 (*proc_getdeviceinfo)(struct super_block *sb,
+                       struct svc_rqst *rqstp,
                         struct nfs4_client *clp,
                         struct nfsd4_getdeviceinfo *gdevp);
         __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
@@ -44,6 +45,9 @@ extern const struct nfsd4_layout_ops bl_layout_ops;
  #ifdef CONFIG_NFSD_SCSILAYOUT
  extern const struct nfsd4_layout_ops scsi_layout_ops;
  #endif
+#ifdef CONFIG_NFSD_FLEXFILELAYOUT
+extern const struct nfsd4_layout_ops ff_layout_ops;
+#endif
  
  __be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
                 struct nfsd4_compound_state *cstate, stateid_t *stateid,
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h

index 64053eadeb818f2a754bd791af7010b992e99d1a..b95adf9a15954b02a9a37f165f3fc8ef331ccfb8 100644 (file)
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -345,6 +345,7 @@ struct nfs4_client {
         u32                     cl_exchange_flags;
         /* number of rpc's in progress over an associated session: */
         atomic_t                cl_refcount;
+       struct nfs4_op_map      cl_spo_must_allow;
  
         /* for nfs41 callbacks */
         /* We currently support a single back channel with a single slot */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c

index 6fbd81ecb41080a6e81aa712fc45af77c3a4fde1..ba944123167b92f3a7460d8acc66b02dc7c53575 100644 (file)
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1135,96 +1135,37 @@ nfsd_check_ignore_resizing(struct iattr *iap)
                 iap->ia_valid &= ~ATTR_SIZE;
  }
  
-/*
- * Create a file (regular, directory, device, fifo); UNIX sockets 
- * not yet implemented.
- * If the response fh has been verified, the parent directory should
- * already be locked. Note that the parent directory is left locked.
- *
- * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
- */
+/* The parent directory should already be locked: */
  __be32
-nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
                 char *fname, int flen, struct iattr *iap,
                 int type, dev_t rdev, struct svc_fh *resfhp)
  {
-       struct dentry   *dentry, *dchild = NULL;
+       struct dentry   *dentry, *dchild;
         struct inode    *dirp;
         __be32          err;
         __be32          err2;
         int             host_err;
  
-       err = nfserr_perm;
-       if (!flen)
-               goto out;
-       err = nfserr_exist;
-       if (isdotent(fname, flen))
-               goto out;
-
-       err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
-       if (err)
-               goto out;
-
         dentry = fhp->fh_dentry;
         dirp = d_inode(dentry);
  
-       err = nfserr_notdir;
-       if (!dirp->i_op->lookup)
-               goto out;
-       /*
-        * Check whether the response file handle has been verified yet.
-        * If it has, the parent directory should already be locked.
-        */
-       if (!resfhp->fh_dentry) {
-               host_err = fh_want_write(fhp);
-               if (host_err)
-                       goto out_nfserr;
-
-               /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
-               fh_lock_nested(fhp, I_MUTEX_PARENT);
-               dchild = lookup_one_len(fname, dentry, flen);
-               host_err = PTR_ERR(dchild);
-               if (IS_ERR(dchild))
-                       goto out_nfserr;
-               err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
-               if (err)
-                       goto out;
-       } else {
-               /* called from nfsd_proc_create */
-               dchild = dget(resfhp->fh_dentry);
-               if (!fhp->fh_locked) {
-                       /* not actually possible */
-                       printk(KERN_ERR
-                               "nfsd_create: parent %pd2 not locked!\n",
+       dchild = dget(resfhp->fh_dentry);
+       if (!fhp->fh_locked) {
+               WARN_ONCE(1, "nfsd_create: parent %pd2 not locked!\n",
                                 dentry);
-                       err = nfserr_io;
-                       goto out;
-               }
-       }
-       /*
-        * Make sure the child dentry is still negative ...
-        */
-       err = nfserr_exist;
-       if (d_really_is_positive(dchild)) {
-               dprintk("nfsd_create: dentry %pd/%pd not negative!\n",
-                       dentry, dchild);
-               goto out; 
+               err = nfserr_io;
+               goto out;
         }
  
+       err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE);
+       if (err)
+               goto out;
+
         if (!(iap->ia_valid & ATTR_MODE))
                 iap->ia_mode = 0;
         iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;
  
-       err = nfserr_inval;
-       if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) {
-               printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
-                      type);
-               goto out;
-       }
-
-       /*
-        * Get the dir op function pointer.
-        */
         err = 0;
         host_err = 0;
         switch (type) {
@@ -1242,6 +1183,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
         case S_IFSOCK:
                 host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
                 break;
+       default:
+               printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
+                      type);
+               host_err = -EINVAL;
         }
         if (host_err < 0)
                 goto out_nfserr;
@@ -1251,7 +1196,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
         /*
          * nfsd_create_setattr already committed the child.  Transactional
          * filesystems had a chance to commit changes for both parent and
-        * child * simultaneously making the following commit_metadata a
+        * child simultaneously making the following commit_metadata a
          * noop.
          */
         err2 = nfserrno(commit_metadata(fhp));
@@ -1263,8 +1208,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
         if (!err)
                 err = fh_update(resfhp);
  out:
-       if (dchild && !IS_ERR(dchild))
-               dput(dchild);
+       dput(dchild);
         return err;
  
  out_nfserr:
@@ -1272,6 +1216,50 @@ out_nfserr:
         goto out;
  }
  
+/*
+ * Create a filesystem object (regular, directory, special) named
+ * fname/flen in the directory fhp; the new handle is composed into resfhp.
+ * The parent directory is locked here and is left locked on return.
+ * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
+ */
+__be32
+nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+               char *fname, int flen, struct iattr *iap,
+               int type, dev_t rdev, struct svc_fh *resfhp)
+{
+       struct dentry   *dentry, *dchild = NULL;
+       __be32          err;
+       int             host_err;
+
+       if (isdotent(fname, flen))
+               return nfserr_exist;
+
+       err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_NOP);
+       if (err)
+               return err;
+
+       dentry = fhp->fh_dentry;
+
+       host_err = fh_want_write(fhp);
+       if (host_err)
+               return nfserrno(host_err);
+
+       fh_lock_nested(fhp, I_MUTEX_PARENT);
+       dchild = lookup_one_len(fname, dentry, flen);
+       host_err = PTR_ERR(dchild);
+       if (IS_ERR(dchild))
+               return nfserrno(host_err);
+       err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
+       /*
+        * Always drop the lookup reference; fh_compose() took its own.
+        */
+       dput(dchild);
+       if (err)
+               return err;
+       return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type,
+                                       rdev, resfhp);
+}
+
  #ifdef CONFIG_NFSD_V3
  
  /*
@@ -1304,12 +1292,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
         dentry = fhp->fh_dentry;
         dirp = d_inode(dentry);
  
-       /* Get all the sanity checks out of the way before
-        * we lock the parent. */
-       err = nfserr_notdir;
-       if (!dirp->i_op->lookup)
-               goto out;
-
         host_err = fh_want_write(fhp);
         if (host_err)
                 goto out_nfserr;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h

index 2d573ec057f801d6406ccf51d21549eb59009a44..3cbb1b33777b5219aef4116fb6e07fd83183a8a8 100644 (file)
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -59,6 +59,9 @@ __be32                nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
  __be32         nfsd4_clone_file_range(struct file *, u64, struct file *,
                         u64, u64);
  #endif /* CONFIG_NFSD_V4 */
+__be32         nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
+                               char *name, int len, struct iattr *attrs,
+                               int type, dev_t rdev, struct svc_fh *res);
  __be32         nfsd_create(struct svc_rqst *, struct svc_fh *,
                                 char *name, int len, struct iattr *attrs,
                                 int type, dev_t rdev, struct svc_fh *res);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h

index d9554813e58afaa15d8e94292e776ea7b7f27f01..beea0c5edc51436cb3525fa0f2cc58463f07a3e0 100644 (file)
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -59,6 +59,7 @@ struct nfsd4_compound_state {
         struct nfsd4_session    *session;
         struct nfsd4_slot       *slot;
         int                     data_offset;
+       bool                    spo_must_allowed;
         size_t                  iovlen;
         u32                     minorversion;
         __be32                  status;
@@ -403,6 +404,8 @@ struct nfsd4_exchange_id {
         clientid_t      clientid;
         u32             seqid;
         int             spa_how;
+       u32             spo_must_enforce[3];
+       u32             spo_must_allow[3];
  };
  
  struct nfsd4_sequence {
@@ -654,6 +657,8 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
  
  }
  
+
+bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp);
  int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
  int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
                 struct nfsd4_compoundargs *);
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c

index 9718da86ad01a804db1e3c7e2a2ca6923f6299a3..821b34816976a112df8d950555500ed9bbf64361 100644 (file)
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -100,10 +100,6 @@ static int switch_gc_head(struct ubifs_info *c)
         if (err)
                 return err;
  
-       err = ubifs_wbuf_sync_nolock(wbuf);
-       if (err)
-               return err;
-
         err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
         if (err)
                 return err;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c

index 70349954e78b3118f23c031773caf3c038f84552..4ec051089186ea2aef6bb4f96e22fefa24a10777 100644 (file)
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -520,19 +520,19 @@ static int init_constants_early(struct ubifs_info *c)
         c->max_write_shift = fls(c->max_write_size) - 1;
  
         if (c->leb_size < UBIFS_MIN_LEB_SZ) {
-               ubifs_err(c, "too small LEBs (%d bytes), min. is %d bytes",
-                         c->leb_size, UBIFS_MIN_LEB_SZ);
+               ubifs_errc(c, "too small LEBs (%d bytes), min. is %d bytes",
+                          c->leb_size, UBIFS_MIN_LEB_SZ);
                 return -EINVAL;
         }
  
         if (c->leb_cnt < UBIFS_MIN_LEB_CNT) {
-               ubifs_err(c, "too few LEBs (%d), min. is %d",
-                         c->leb_cnt, UBIFS_MIN_LEB_CNT);
+               ubifs_errc(c, "too few LEBs (%d), min. is %d",
+                          c->leb_cnt, UBIFS_MIN_LEB_CNT);
                 return -EINVAL;
         }
  
         if (!is_power_of_2(c->min_io_size)) {
-               ubifs_err(c, "bad min. I/O size %d", c->min_io_size);
+               ubifs_errc(c, "bad min. I/O size %d", c->min_io_size);
                 return -EINVAL;
         }
  
@@ -543,8 +543,8 @@ static int init_constants_early(struct ubifs_info *c)
         if (c->max_write_size < c->min_io_size ||
             c->max_write_size % c->min_io_size ||
             !is_power_of_2(c->max_write_size)) {
-               ubifs_err(c, "bad write buffer size %d for %d min. I/O unit",
-                         c->max_write_size, c->min_io_size);
+               ubifs_errc(c, "bad write buffer size %d for %d min. I/O unit",
+                          c->max_write_size, c->min_io_size);
                 return -EINVAL;
         }
  
@@ -2108,8 +2108,9 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
          */
         ubi = open_ubi(name, UBI_READONLY);
         if (IS_ERR(ubi)) {
-               pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
-                      current->pid, name, (int)PTR_ERR(ubi));
+               if (!(flags & MS_SILENT))
+                       pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
+                              current->pid, name, (int)PTR_ERR(ubi));
                 return ERR_CAST(ubi);
         }
  
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h

index ddf9f6b9eee24400d676917a2c899c784ac5b86f..4617d459022a5df4cfa2645bea329c31fa3b7a00 100644 (file)
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1783,8 +1783,8 @@ void ubifs_err(const struct ubifs_info *c, const char *fmt, ...);
  __printf(2, 3)
  void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...);
  /*
- * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description
- * object as an argument.
+ * A conditional variant of 'ubifs_err()' which doesn't output anything
+ * if probing (i.e. MS_SILENT set).
   */
  #define ubifs_errc(c, fmt, ...)                                                \
  do {                                                                   \
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c

index b5fc27969e9d31bc0a9e951cc50a83c66d9c1352..e237811f09ce5ce6139aab0951047a74dd715665 100644 (file)
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -592,19 +592,19 @@ static int ubifs_xattr_set(const struct xattr_handler *handler,
                 return __ubifs_removexattr(inode, name);
  }
  
-const struct xattr_handler ubifs_user_xattr_handler = {
+static const struct xattr_handler ubifs_user_xattr_handler = {
         .prefix = XATTR_USER_PREFIX,
         .get = ubifs_xattr_get,
         .set = ubifs_xattr_set,
  };
  
-const struct xattr_handler ubifs_trusted_xattr_handler = {
+static const struct xattr_handler ubifs_trusted_xattr_handler = {
         .prefix = XATTR_TRUSTED_PREFIX,
         .get = ubifs_xattr_get,
         .set = ubifs_xattr_set,
  };
  
-const struct xattr_handler ubifs_security_xattr_handler = {
+static const struct xattr_handler ubifs_security_xattr_handler = {
         .prefix = XATTR_SECURITY_PREFIX,
         .get = ubifs_xattr_get,
         .set = ubifs_xattr_set,
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile

index 3542d94fddce5ca4a45e33e08a25fc894b2264eb..52c288514be1ff729a38f7b6d9a8e7232ea60b67 100644 (file)
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -121,5 +121,4 @@ xfs-$(CONFIG_XFS_RT)                += xfs_rtalloc.o
  xfs-$(CONFIG_XFS_POSIX_ACL)    += xfs_acl.o
  xfs-$(CONFIG_SYSCTL)           += xfs_sysctl.o
  xfs-$(CONFIG_COMPAT)           += xfs_ioctl32.o
-xfs-$(CONFIG_NFSD_BLOCKLAYOUT) += xfs_pnfs.o
-xfs-$(CONFIG_NFSD_SCSILAYOUT)  += xfs_pnfs.o
+xfs-$(CONFIG_EXPORTFS_BLOCK_OPS)       += xfs_pnfs.o
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c

index a1b2dd828b9d0ae447419b304793d03e9ec144f5..fe1bfee35898ea4d9b657c7935f90f586e7ab9c9 100644 (file)
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = {
         .fh_to_parent           = xfs_fs_fh_to_parent,
         .get_parent             = xfs_fs_get_parent,
         .commit_metadata        = xfs_fs_nfs_commit_metadata,
-#ifdef CONFIG_NFSD_BLOCKLAYOUT
+#ifdef CONFIG_EXPORTFS_BLOCK_OPS
         .get_uuid               = xfs_fs_get_uuid,
         .map_blocks             = xfs_fs_map_blocks,
         .commit_blocks          = xfs_fs_commit_blocks,
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h

index 93f74853961b1cce598f5b83e96e622457ae173e..e8339f74966b18b43a929ed29f3d9259aa17142b 100644 (file)
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -1,7 +1,7 @@
  #ifndef _XFS_PNFS_H
  #define _XFS_PNFS_H 1
  
-#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
+#ifdef CONFIG_EXPORTFS_BLOCK_OPS
  int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
  int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
                 struct iomap *iomap, bool write, u32 *device_generation);
@@ -15,5 +15,5 @@ xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex)
  {
         return 0;
  }
-#endif /* CONFIG_NFSD_PNFS */
+#endif /* CONFIG_EXPORTFS_BLOCK_OPS */
  #endif /* _XFS_PNFS_H */
diff --git a/include/asm-generic/rtc.h b/include/asm-generic/rtc.h

deleted file mode 100644 (file)

index 4e3b655..0000000
--- a/include/asm-generic/rtc.h
+++ /dev/null
@@ -1,247 +0,0 @@
-/* 
- * include/asm-generic/rtc.h
- *
- * Author: Tom Rini <trini@mvista.com>
- *
- * Based on:
- * drivers/char/rtc.c
- *
- * Please read the COPYING file for all license details.
- */
-
-#ifndef __ASM_RTC_H__
-#define __ASM_RTC_H__
-
-#include <linux/mc146818rtc.h>
-#include <linux/rtc.h>
-#include <linux/bcd.h>
-#include <linux/delay.h>
-#ifdef CONFIG_ACPI
-#include <linux/acpi.h>
-#endif
-
-#define RTC_PIE 0x40           /* periodic interrupt enable */
-#define RTC_AIE 0x20           /* alarm interrupt enable */
-#define RTC_UIE 0x10           /* update-finished interrupt enable */
-
-/* some dummy definitions */
-#define RTC_BATT_BAD 0x100     /* battery bad */
-#define RTC_SQWE 0x08          /* enable square-wave output */
-#define RTC_DM_BINARY 0x04     /* all time/date values are BCD if clear */
-#define RTC_24H 0x02           /* 24 hour mode - else hours bit 7 means pm */
-#define RTC_DST_EN 0x01                /* auto switch DST - works f. USA only */
-
-/*
- * Returns true if a clock update is in progress
- */
-static inline unsigned char rtc_is_updating(void)
-{
-       unsigned char uip;
-       unsigned long flags;
-
-       spin_lock_irqsave(&rtc_lock, flags);
-       uip = (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP);
-       spin_unlock_irqrestore(&rtc_lock, flags);
-       return uip;
-}
-
-static inline unsigned int __get_rtc_time(struct rtc_time *time)
-{
-       unsigned char ctrl;
-       unsigned long flags;
-       unsigned char century = 0;
-
-#ifdef CONFIG_MACH_DECSTATION
-       unsigned int real_year;
-#endif
-
-       /*
-        * read RTC once any update in progress is done. The update
-        * can take just over 2ms. We wait 20ms. There is no need to
-        * to poll-wait (up to 1s - eeccch) for the falling edge of RTC_UIP.
-        * If you need to know *exactly* when a second has started, enable
-        * periodic update complete interrupts, (via ioctl) and then 
-        * immediately read /dev/rtc which will block until you get the IRQ.
-        * Once the read clears, read the RTC time (again via ioctl). Easy.
-        */
-       if (rtc_is_updating())
-               mdelay(20);
-
-       /*
-        * Only the values that we read from the RTC are set. We leave
-        * tm_wday, tm_yday and tm_isdst untouched. Even though the
-        * RTC has RTC_DAY_OF_WEEK, we ignore it, as it is only updated
-        * by the RTC when initially set to a non-zero value.
-        */
-       spin_lock_irqsave(&rtc_lock, flags);
-       time->tm_sec = CMOS_READ(RTC_SECONDS);
-       time->tm_min = CMOS_READ(RTC_MINUTES);
-       time->tm_hour = CMOS_READ(RTC_HOURS);
-       time->tm_mday = CMOS_READ(RTC_DAY_OF_MONTH);
-       time->tm_mon = CMOS_READ(RTC_MONTH);
-       time->tm_year = CMOS_READ(RTC_YEAR);
-#ifdef CONFIG_MACH_DECSTATION
-       real_year = CMOS_READ(RTC_DEC_YEAR);
-#endif
-#ifdef CONFIG_ACPI
-       if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
-           acpi_gbl_FADT.century)
-               century = CMOS_READ(acpi_gbl_FADT.century);
-#endif
-       ctrl = CMOS_READ(RTC_CONTROL);
-       spin_unlock_irqrestore(&rtc_lock, flags);
-
-       if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-       {
-               time->tm_sec = bcd2bin(time->tm_sec);
-               time->tm_min = bcd2bin(time->tm_min);
-               time->tm_hour = bcd2bin(time->tm_hour);
-               time->tm_mday = bcd2bin(time->tm_mday);
-               time->tm_mon = bcd2bin(time->tm_mon);
-               time->tm_year = bcd2bin(time->tm_year);
-               century = bcd2bin(century);
-       }
-
-#ifdef CONFIG_MACH_DECSTATION
-       time->tm_year += real_year - 72;
-#endif
-
-       if (century)
-               time->tm_year += (century - 19) * 100;
-
-       /*
-        * Account for differences between how the RTC uses the values
-        * and how they are defined in a struct rtc_time;
-        */
-       if (time->tm_year <= 69)
-               time->tm_year += 100;
-
-       time->tm_mon--;
-
-       return RTC_24H;
-}
-
-#ifndef get_rtc_time
-#define get_rtc_time   __get_rtc_time
-#endif
-
-/* Set the current date and time in the real time clock. */
-static inline int __set_rtc_time(struct rtc_time *time)
-{
-       unsigned long flags;
-       unsigned char mon, day, hrs, min, sec;
-       unsigned char save_control, save_freq_select;
-       unsigned int yrs;
-#ifdef CONFIG_MACH_DECSTATION
-       unsigned int real_yrs, leap_yr;
-#endif
-       unsigned char century = 0;
-
-       yrs = time->tm_year;
-       mon = time->tm_mon + 1;   /* tm_mon starts at zero */
-       day = time->tm_mday;
-       hrs = time->tm_hour;
-       min = time->tm_min;
-       sec = time->tm_sec;
-
-       if (yrs > 255)  /* They are unsigned */
-               return -EINVAL;
-
-       spin_lock_irqsave(&rtc_lock, flags);
-#ifdef CONFIG_MACH_DECSTATION
-       real_yrs = yrs;
-       leap_yr = ((!((yrs + 1900) % 4) && ((yrs + 1900) % 100)) ||
-                       !((yrs + 1900) % 400));
-       yrs = 72;
-
-       /*
-        * We want to keep the year set to 73 until March
-        * for non-leap years, so that Feb, 29th is handled
-        * correctly.
-        */
-       if (!leap_yr && mon < 3) {
-               real_yrs--;
-               yrs = 73;
-       }
-#endif
-
-#ifdef CONFIG_ACPI
-       if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
-           acpi_gbl_FADT.century) {
-               century = (yrs + 1900) / 100;
-               yrs %= 100;
-       }
-#endif
-
-       /* These limits and adjustments are independent of
-        * whether the chip is in binary mode or not.
-        */
-       if (yrs > 169) {
-               spin_unlock_irqrestore(&rtc_lock, flags);
-               return -EINVAL;
-       }
-
-       if (yrs >= 100)
-               yrs -= 100;
-
-       if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY)
-           || RTC_ALWAYS_BCD) {
-               sec = bin2bcd(sec);
-               min = bin2bcd(min);
-               hrs = bin2bcd(hrs);
-               day = bin2bcd(day);
-               mon = bin2bcd(mon);
-               yrs = bin2bcd(yrs);
-               century = bin2bcd(century);
-       }
-
-       save_control = CMOS_READ(RTC_CONTROL);
-       CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
-       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
-       CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
-
-#ifdef CONFIG_MACH_DECSTATION
-       CMOS_WRITE(real_yrs, RTC_DEC_YEAR);
-#endif
-       CMOS_WRITE(yrs, RTC_YEAR);
-       CMOS_WRITE(mon, RTC_MONTH);
-       CMOS_WRITE(day, RTC_DAY_OF_MONTH);
-       CMOS_WRITE(hrs, RTC_HOURS);
-       CMOS_WRITE(min, RTC_MINUTES);
-       CMOS_WRITE(sec, RTC_SECONDS);
-#ifdef CONFIG_ACPI
-       if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
-           acpi_gbl_FADT.century)
-               CMOS_WRITE(century, acpi_gbl_FADT.century);
-#endif
-
-       CMOS_WRITE(save_control, RTC_CONTROL);
-       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
-
-       spin_unlock_irqrestore(&rtc_lock, flags);
-
-       return 0;
-}
-
-#ifndef set_rtc_time
-#define set_rtc_time   __set_rtc_time
-#endif
-
-static inline unsigned int get_rtc_ss(void)
-{
-       struct rtc_time h;
-
-       get_rtc_time(&h);
-       return h.tm_sec;
-}
-
-static inline int get_rtc_pll(struct rtc_pll_info *pll)
-{
-       return -EINVAL;
-}
-static inline int set_rtc_pll(struct rtc_pll_info *pll)
-{
-       return -EINVAL;
-}
-
-#endif /* __ASM_RTC_H__ */
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h

index 314b3caa701cc20a6c3fcd628de3c48b7c0c7d21..1303b570b18cc98f0c92cc5d08a309c605468cd8 100644 (file)
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -113,6 +113,8 @@ extern int suid_dumpable;
  extern int setup_arg_pages(struct linux_binprm * bprm,
                            unsigned long stack_top,
                            int executable_stack);
+extern int transfer_args_to_stack(struct linux_binprm *bprm,
+                                 unsigned long *sp_location);
  extern int bprm_change_interp(char *interp, struct linux_binprm *bprm);
  extern int copy_strings_kernel(int argc, const char *const *argv,
                                struct linux_binprm *bprm);
diff --git a/include/linux/ds1286.h b/include/linux/ds1286.h

deleted file mode 100644 (file)

index 45ea0aa..0000000
--- a/include/linux/ds1286.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (C) 1998, 1999, 2003 Ralf Baechle
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#ifndef __LINUX_DS1286_H
-#define __LINUX_DS1286_H
-
-/**********************************************************************
- * register summary
- **********************************************************************/
-#define RTC_HUNDREDTH_SECOND   0
-#define RTC_SECONDS            1
-#define RTC_MINUTES            2
-#define RTC_MINUTES_ALARM      3
-#define RTC_HOURS              4
-#define RTC_HOURS_ALARM                5
-#define RTC_DAY                        6
-#define RTC_DAY_ALARM          7
-#define RTC_DATE               8
-#define RTC_MONTH              9
-#define RTC_YEAR               10
-#define RTC_CMD                        11
-#define RTC_WHSEC              12
-#define RTC_WSEC               13
-#define RTC_UNUSED             14
-
-/* RTC_*_alarm is always true if 2 MSBs are set */
-# define RTC_ALARM_DONT_CARE   0xC0
-
-
-/*
- * Bits in the month register
- */
-#define RTC_EOSC               0x80
-#define RTC_ESQW               0x40
-
-/*
- * Bits in the Command register
- */
-#define RTC_TDF                        0x01
-#define RTC_WAF                        0x02
-#define RTC_TDM                        0x04
-#define RTC_WAM                        0x08
-#define RTC_PU_LVL             0x10
-#define RTC_IBH_LO             0x20
-#define RTC_IPSW               0x40
-#define RTC_TE                 0x80
-
-#endif /* __LINUX_DS1286_H */
diff --git a/include/linux/ds17287rtc.h b/include/linux/ds17287rtc.h

deleted file mode 100644 (file)

index d85d3f4..0000000
--- a/include/linux/ds17287rtc.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * ds17287rtc.h - register definitions for the ds1728[57] RTC / CMOS RAM
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * (C) 2003 Guido Guenther <agx@sigxcpu.org>
- */
-#ifndef __LINUX_DS17287RTC_H
-#define __LINUX_DS17287RTC_H
-
-#include <linux/rtc.h>                 /* get the user-level API */
-#include <linux/mc146818rtc.h>
-
-/* Register A */
-#define DS_REGA_DV2    0x40            /* countdown chain */
-#define DS_REGA_DV1    0x20            /* oscillator enable */
-#define DS_REGA_DV0    0x10            /* bank select */
-
-/* bank 1 registers */
-#define DS_B1_MODEL    0x40            /* model number byte */
-#define DS_B1_SN1      0x41            /* serial number byte 1 */
-#define DS_B1_SN2      0x42            /* serial number byte 2 */
-#define DS_B1_SN3      0x43            /* serial number byte 3 */
-#define DS_B1_SN4      0x44            /* serial number byte 4 */
-#define DS_B1_SN5      0x45            /* serial number byte 5 */
-#define DS_B1_SN6      0x46            /* serial number byte 6 */
-#define DS_B1_CRC      0x47            /* CRC byte */
-#define DS_B1_CENTURY  0x48            /* Century byte */
-#define DS_B1_DALARM   0x49            /* date alarm */
-#define DS_B1_XCTRL4A  0x4a            /* extendec control register 4a */
-#define DS_B1_XCTRL4B  0x4b            /* extendec control register 4b */
-#define DS_B1_RTCADDR2         0x4e            /* rtc address 2 */
-#define DS_B1_RTCADDR3         0x4f            /* rtc address 3 */
-#define DS_B1_RAMLSB   0x50            /* extended ram LSB */
-#define DS_B1_RAMMSB   0x51            /* extended ram MSB */
-#define DS_B1_RAMDPORT 0x53            /* extended ram data port */
-
-/* register details */
-/* extended control register 4a */
-#define DS_XCTRL4A_VRT2        0x80            /* valid ram and time */
-#define DS_XCTRL4A_INCR        0x40            /* increment progress status */
-#define DS_XCTRL4A_BME 0x20            /* burst mode enable */
-#define DS_XCTRL4A_PAB 0x08            /* power active bar ctrl */
-#define DS_XCTRL4A_RF  0x04            /* ram clear flag */
-#define DS_XCTRL4A_WF  0x02            /* wake up alarm flag */
-#define DS_XCTRL4A_KF  0x01            /* kickstart flag */
-
-/* interrupt causes */
-#define DS_XCTRL4A_IFS (DS_XCTRL4A_RF|DS_XCTRL4A_WF|DS_XCTRL4A_KF)
-
-/* extended control register 4b */
-#define DS_XCTRL4B_ABE 0x80            /* auxiliary battery enable */
-#define DS_XCTRL4B_E32K        0x40            /* enable 32.768 kHz Output */
-#define DS_XCTRL4B_CS  0x20            /* crystal select */
-#define DS_XCTRL4B_RCE 0x10            /* ram clear enable */
-#define DS_XCTRL4B_PRS 0x08            /* PAB resec select */
-#define DS_XCTRL4B_RIE 0x04            /* ram clear interrupt enable */
-#define DS_XCTRL4B_WFE 0x02            /* wake up alarm interrupt enable */
-#define DS_XCTRL4B_KFE 0x01            /* kickstart interrupt enable */
-
-/* interrupt enable bits */
-#define DS_XCTRL4B_IFES        (DS_XCTRL4B_RIE|DS_XCTRL4B_WFE|DS_XCTRL4B_KFE)
-
-#endif /* __LINUX_DS17287RTC_H */
diff --git a/include/linux/i8042.h b/include/linux/i8042.h

index 0f9bafa17a02dde9af7f4866ca8b9f741c83ddf1..d98780ca9604a7cae22d6c103c1b54eea2e22d0d 100644 (file)
--- a/include/linux/i8042.h
+++ b/include/linux/i8042.h
@@ -62,7 +62,6 @@ struct serio;
  void i8042_lock_chip(void);
  void i8042_unlock_chip(void);
  int i8042_command(unsigned char *param, int command);
-bool i8042_check_port_owner(const struct serio *);
  int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str,
                                         struct serio *serio));
  int i8042_remove_filter(bool (*filter)(unsigned char data, unsigned char str,
@@ -83,11 +82,6 @@ static inline int i8042_command(unsigned char *param, int command)
         return -ENODEV;
  }
  
-static inline bool i8042_check_port_owner(const struct serio *serio)
-{
-       return false;
-}
-
  static inline int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str,
                                         struct serio *serio))
  {
diff --git a/include/linux/m48t86.h b/include/linux/m48t86.h

deleted file mode 100644 (file)

index 915d6b4..0000000
--- a/include/linux/m48t86.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * ST M48T86 / Dallas DS12887 RTC driver
- * Copyright (c) 2006 Tower Technologies
- *
- * Author: Alessandro Zummo <a.zummo@towertech.it>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-struct m48t86_ops
-{
-       void (*writebyte)(unsigned char value, unsigned long addr);
-       unsigned char (*readbyte)(unsigned long addr);
-};
diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h

index 433e0c74d643fecb192e777aaeee485c65a1da0d..a585b4b5fa0ef70e21cac6b02b0c4150ed3cc68b 100644 (file)
--- a/include/linux/mc146818rtc.h
+++ b/include/linux/mc146818rtc.h
@@ -14,6 +14,8 @@
  #include <asm/io.h>
  #include <linux/rtc.h>                 /* get the user-level API */
  #include <asm/mc146818rtc.h>           /* register access macros */
+#include <linux/bcd.h>
+#include <linux/delay.h>
  
  #ifdef __KERNEL__
  #include <linux/spinlock.h>            /* spinlock_t */
@@ -120,4 +122,7 @@ struct cmos_rtc_board_info {
  #define RTC_IO_EXTENT_USED      RTC_IO_EXTENT
  #endif /* ARCH_RTC_LOCATION */
  
+unsigned int mc146818_get_time(struct rtc_time *time);
+int mc146818_set_time(struct rtc_time *time);
+
  #endif /* _MC146818RTC_H */
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h

index e6f6910278f30a3fa734da4308175574eb1f0cac..42da3552f7cbe00f00c9c01ab49651da421aa80a 100644 (file)
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -220,6 +220,7 @@ enum {
         MLX4_DEV_CAP_FLAG2_LB_SRC_CHK           = 1ULL << 32,
         MLX4_DEV_CAP_FLAG2_ROCE_V1_V2           = 1ULL <<  33,
         MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER   = 1ULL <<  34,
+       MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT        = 1ULL <<  35,
  };
  
  enum {
@@ -1342,6 +1343,9 @@ enum {
         VXLAN_STEER_BY_INNER_VLAN       = 1 << 4,
  };
  
+enum {
+       MLX4_OP_MOD_QUERY_TRANSPORT_CI_ERRORS = 0x2,
+};
  
  int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, u32 qpn,
                                 enum mlx4_net_trans_promisc_mode mode);
@@ -1382,6 +1386,9 @@ void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
  int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
  int mlx4_SYNC_TPT(struct mlx4_dev *dev);
  int mlx4_test_interrupts(struct mlx4_dev *dev);
+int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier,
+                            const u32 offset[], u32 value[],
+                            size_t array_len, u8 port);
  u32 mlx4_get_eqs_per_port(struct mlx4_dev *dev, u8 port);
  bool mlx4_is_eq_vector_valid(struct mlx4_dev *dev, u8 port, int vector);
  struct cpu_rmap *mlx4_get_cpu_rmap(struct mlx4_dev *dev, int port);
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h

index 2be976dd49669c21829c5c798711786e2cb68f74..2566f6d6444f15a02d7df36aed04f4f740b5e4f5 100644 (file)
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -58,6 +58,8 @@ struct mlx5_core_cq {
                 void (*comp)(struct mlx5_core_cq *);
                 void            *priv;
         } tasklet_ctx;
+       int                     reset_notify_added;
+       struct list_head        reset_notify;
  };
  
  
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h

index a041b99fceac5bf9d5c70fb637146ec60b3a8eac..ccea6fb1648287fa25f9675df045eec867295528 100644 (file)
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -46,6 +46,7 @@
  
  #include <linux/mlx5/device.h>
  #include <linux/mlx5/doorbell.h>
+#include <linux/mlx5/srq.h>
  
  enum {
         MLX5_RQ_BITMASK_VSD = 1 << 1,
@@ -798,11 +799,10 @@ struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
  void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev,
                                  struct mlx5_cmd_mailbox *head);
  int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                        struct mlx5_create_srq_mbox_in *in, int inlen,
-                        int is_xrc);
+                        struct mlx5_srq_attr *in);
  int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq);
  int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-                       struct mlx5_query_srq_mbox_out *out);
+                       struct mlx5_srq_attr *out);
  int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
                       u16 lwm, int is_srq);
  void mlx5_init_mkey_table(struct mlx5_core_dev *dev);
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h

index ab310819ac3605a426c35ae273d4a66926794c48..7879bf41189105c816e6deec5bfdc4218d5e4c86 100644 (file)
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -556,9 +556,9 @@ struct mlx5_destroy_qp_mbox_out {
  struct mlx5_modify_qp_mbox_in {
         struct mlx5_inbox_hdr   hdr;
         __be32                  qpn;
-       u8                      rsvd1[4];
-       __be32                  optparam;
         u8                      rsvd0[4];
+       __be32                  optparam;
+       u8                      rsvd1[4];
         struct mlx5_qp_context  ctx;
         u8                      rsvd2[16];
  };
diff --git a/include/linux/mlx5/srq.h b/include/linux/mlx5/srq.h

index f43ed054a3e0904c2b99a844fdd79964c633fe9a..33c97dc900f82493f9f5bf6a4f375d37cbe4a4f8 100644 (file)
--- a/include/linux/mlx5/srq.h
+++ b/include/linux/mlx5/srq.h
@@ -35,6 +35,31 @@
  
  #include <linux/mlx5/driver.h>
  
+enum {
+       MLX5_SRQ_FLAG_ERR    = (1 << 0),
+       MLX5_SRQ_FLAG_WQ_SIG = (1 << 1),
+};
+
+struct mlx5_srq_attr {
+       u32 type;
+       u32 flags;
+       u32 log_size;
+       u32 wqe_shift;
+       u32 log_page_size;
+       u32 wqe_cnt;
+       u32 srqn;
+       u32 xrcd;
+       u32 page_offset;
+       u32 cqn;
+       u32 pd;
+       u32 lwm;
+       u32 user_index;
+       u64 db_record;
+       u64 *pas;
+};
+
+struct mlx5_core_dev;
+
  void mlx5_init_srq_table(struct mlx5_core_dev *dev);
  void mlx5_cleanup_srq_table(struct mlx5_core_dev *dev);
  
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h

index bfed6b367350cdd34fe4e87598ec5cb94aac60f8..c6564ada9bebb8ea0787e60fc8cea3b46d09433c 100644 (file)
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -643,4 +643,15 @@ enum pnfs_update_layout_reason {
         PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET,
  };
  
+#define NFS4_OP_MAP_NUM_LONGS                                  \
+       DIV_ROUND_UP(LAST_NFS4_OP, 8 * sizeof(unsigned long))
+#define NFS4_OP_MAP_NUM_WORDS \
+       (NFS4_OP_MAP_NUM_LONGS * sizeof(unsigned long) / sizeof(u32))
+struct nfs4_op_map {
+       union {
+               unsigned long longs[NFS4_OP_MAP_NUM_LONGS];
+               u32 words[NFS4_OP_MAP_NUM_WORDS];
+       } u;
+};
+
  #endif
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h

index 82b81a1c24382740366dfeea40ec11dd611372a6..5bcbbe511be663a91b1f50886e549974ca1e7af3 100644 (file)
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1185,17 +1185,6 @@ struct pnfs_ds_commit_info {
         struct pnfs_commit_bucket *buckets;
  };
  
-#define NFS4_OP_MAP_NUM_LONGS \
-       DIV_ROUND_UP(LAST_NFS4_OP, 8 * sizeof(unsigned long))
-#define NFS4_OP_MAP_NUM_WORDS \
-       (NFS4_OP_MAP_NUM_LONGS * sizeof(unsigned long) / sizeof(u32))
-struct nfs4_op_map {
-       union {
-               unsigned long longs[NFS4_OP_MAP_NUM_LONGS];
-               u32 words[NFS4_OP_MAP_NUM_WORDS];
-       } u;
-};
-
  struct nfs41_state_protection {
         u32 how;
         struct nfs4_op_map enforce;
diff --git a/include/linux/platform_data/rtc-ds2404.h b/include/linux/platform_data/rtc-ds2404.h

new file mode 100644 (file)

index 0000000..22c5382
--- /dev/null
+++ b/include/linux/platform_data/rtc-ds2404.h
@@ -0,0 +1,20 @@
+/*
+ * ds2404.h - platform data structure for the DS2404 RTC.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2012 Sven Schnelle <svens@stackframe.org>
+ */
+
+#ifndef __LINUX_DS2404_H
+#define __LINUX_DS2404_H
+
+struct ds2404_platform_data {
+
+       unsigned int gpio_rst;
+       unsigned int gpio_clk;
+       unsigned int gpio_dq;
+};
+#endif
diff --git a/include/linux/platform_data/rtc-m48t86.h b/include/linux/platform_data/rtc-m48t86.h

new file mode 100644 (file)

index 0000000..915d6b4
--- /dev/null
+++ b/include/linux/platform_data/rtc-m48t86.h
@@ -0,0 +1,16 @@
+/*
+ * ST M48T86 / Dallas DS12887 RTC driver
+ * Copyright (c) 2006 Tower Technologies
+ *
+ * Author: Alessandro Zummo <a.zummo@towertech.it>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+struct m48t86_ops
+{
+       void (*writebyte)(unsigned char value, unsigned long addr);
+       unsigned char (*readbyte)(unsigned long addr);
+};
diff --git a/include/linux/platform_data/rtc-v3020.h b/include/linux/platform_data/rtc-v3020.h

new file mode 100644 (file)

index 0000000..e55d82c
--- /dev/null
+++ b/include/linux/platform_data/rtc-v3020.h
@@ -0,0 +1,41 @@
+/*
+ * v3020.h - Registers definition and platform data structure for the v3020 RTC.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2006, 8D Technologies inc.
+ */
+#ifndef __LINUX_V3020_H
+#define __LINUX_V3020_H
+
+/* The v3020 has only one data pin but which one
+ * is used depends on the board. */
+struct v3020_platform_data {
+       int leftshift; /* (1<<(leftshift)) & readl() */
+
+       unsigned int use_gpio:1;
+       unsigned int gpio_cs;
+       unsigned int gpio_wr;
+       unsigned int gpio_rd;
+       unsigned int gpio_io;
+};
+
+#define V3020_STATUS_0 0x00
+#define V3020_STATUS_1 0x01
+#define V3020_SECONDS  0x02
+#define V3020_MINUTES  0x03
+#define V3020_HOURS            0x04
+#define V3020_MONTH_DAY        0x05
+#define V3020_MONTH            0x06
+#define V3020_YEAR             0x07
+#define V3020_WEEK_DAY 0x08
+#define V3020_WEEK             0x09
+
+#define V3020_IS_COMMAND(val) ((val)>=0x0E)
+
+#define V3020_CMD_RAM2CLOCK    0x0E
+#define V3020_CMD_CLOCK2RAM    0x0F
+
+#endif /* __LINUX_V3020_H */
diff --git a/include/linux/rtc-ds2404.h b/include/linux/rtc-ds2404.h

deleted file mode 100644 (file)

index 22c5382..0000000
--- a/include/linux/rtc-ds2404.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * ds2404.h - platform data structure for the DS2404 RTC.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 2012 Sven Schnelle <svens@stackframe.org>
- */
-
-#ifndef __LINUX_DS2404_H
-#define __LINUX_DS2404_H
-
-struct ds2404_platform_data {
-
-       unsigned int gpio_rst;
-       unsigned int gpio_clk;
-       unsigned int gpio_dq;
-};
-#endif
diff --git a/include/linux/rtc-v3020.h b/include/linux/rtc-v3020.h

deleted file mode 100644 (file)

index e55d82c..0000000
--- a/include/linux/rtc-v3020.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * v3020.h - Registers definition and platform data structure for the v3020 RTC.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 2006, 8D Technologies inc.
- */
-#ifndef __LINUX_V3020_H
-#define __LINUX_V3020_H
-
-/* The v3020 has only one data pin but which one
- * is used depends on the board. */
-struct v3020_platform_data {
-       int leftshift; /* (1<<(leftshift)) & readl() */
-
-       unsigned int use_gpio:1;
-       unsigned int gpio_cs;
-       unsigned int gpio_wr;
-       unsigned int gpio_rd;
-       unsigned int gpio_io;
-};
-
-#define V3020_STATUS_0 0x00
-#define V3020_STATUS_1 0x01
-#define V3020_SECONDS  0x02
-#define V3020_MINUTES  0x03
-#define V3020_HOURS            0x04
-#define V3020_MONTH_DAY        0x05
-#define V3020_MONTH            0x06
-#define V3020_YEAR             0x07
-#define V3020_WEEK_DAY 0x08
-#define V3020_WEEK             0x09
-
-#define V3020_IS_COMMAND(val) ((val)>=0x0E)
-
-#define V3020_CMD_RAM2CLOCK    0x0E
-#define V3020_CMD_CLOCK2RAM    0x0F
-
-#endif /* __LINUX_V3020_H */
diff --git a/include/linux/rtc/ds1286.h b/include/linux/rtc/ds1286.h

new file mode 100644 (file)

index 0000000..45ea0aa
--- /dev/null
+++ b/include/linux/rtc/ds1286.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 1998, 1999, 2003 Ralf Baechle
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#ifndef __LINUX_DS1286_H
+#define __LINUX_DS1286_H
+
+/**********************************************************************
+ * register summary
+ **********************************************************************/
+#define RTC_HUNDREDTH_SECOND   0
+#define RTC_SECONDS            1
+#define RTC_MINUTES            2
+#define RTC_MINUTES_ALARM      3
+#define RTC_HOURS              4
+#define RTC_HOURS_ALARM                5
+#define RTC_DAY                        6
+#define RTC_DAY_ALARM          7
+#define RTC_DATE               8
+#define RTC_MONTH              9
+#define RTC_YEAR               10
+#define RTC_CMD                        11
+#define RTC_WHSEC              12
+#define RTC_WSEC               13
+#define RTC_UNUSED             14
+
+/* RTC_*_alarm is always true if 2 MSBs are set */
+# define RTC_ALARM_DONT_CARE   0xC0
+
+
+/*
+ * Bits in the month register
+ */
+#define RTC_EOSC               0x80
+#define RTC_ESQW               0x40
+
+/*
+ * Bits in the Command register
+ */
+#define RTC_TDF                        0x01
+#define RTC_WAF                        0x02
+#define RTC_TDM                        0x04
+#define RTC_WAM                        0x08
+#define RTC_PU_LVL             0x10
+#define RTC_IBH_LO             0x20
+#define RTC_IPSW               0x40
+#define RTC_TE                 0x80
+
+#endif /* __LINUX_DS1286_H */
diff --git a/include/linux/serio.h b/include/linux/serio.h

index df4ab5de15862c7ba35fb532174db56866ffff47..c733cff44e18a74f3949032d1413356df0a9e226 100644 (file)
--- a/include/linux/serio.h
+++ b/include/linux/serio.h
@@ -31,7 +31,8 @@ struct serio {
  
         struct serio_device_id id;
  
-       spinlock_t lock;                /* protects critical sections from port's interrupt handler */
+       /* Protects critical sections from port's interrupt handler */
+       spinlock_t lock;
  
         int (*write)(struct serio *, unsigned char);
         int (*open)(struct serio *);
@@ -40,16 +41,29 @@ struct serio {
         void (*stop)(struct serio *);
  
         struct serio *parent;
-       struct list_head child_node;    /* Entry in parent->children list */
+       /* Entry in parent->children list */
+       struct list_head child_node;
         struct list_head children;
-       unsigned int depth;             /* level of nesting in serio hierarchy */
+       /* Level of nesting in serio hierarchy */
+       unsigned int depth;
  
-       struct serio_driver *drv;       /* accessed from interrupt, must be protected by serio->lock and serio->sem */
-       struct mutex drv_mutex;         /* protects serio->drv so attributes can pin driver */
+       /*
+        * serio->drv is accessed from interrupt handlers; when modifying
+        * caller should acquire serio->drv_mutex and serio->lock.
+        */
+       struct serio_driver *drv;
+       /* Protects serio->drv so attributes can pin current driver */
+       struct mutex drv_mutex;
  
         struct device dev;
  
         struct list_head node;
+
+       /*
+        * For use by PS/2 layer when several ports share hardware and
+        * may get indigestion when exposed to concurrent access (i8042).
+        */
+       struct mutex *ps2_cmd_mutex;
  };
  #define to_serio_port(d)       container_of(d, struct serio, dev)
  
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h

index ed03c9f7f908d7a477ef5349f35439b30b014b85..62a60eeacb0aeaf9907a5247d42706851fec3ffb 100644 (file)
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -78,8 +78,6 @@ struct cache_detail {
         struct hlist_head *     hash_table;
         rwlock_t                hash_lock;
  
-       atomic_t                inuse; /* active user-space update or lookup */
-
         char                    *name;
         void                    (*cache_put)(struct kref *);
  
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h

index 7ca44fb5b675d1c078a2a2f121056606282dad58..7321ae933867566013a250623564d722d2800305 100644 (file)
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -268,6 +268,7 @@ struct svc_rqst {
                                                  * cache pages */
  #define        RQ_VICTIM       (5)                     /* about to be shut down */
  #define        RQ_BUSY         (6)                     /* request is busy */
+#define        RQ_DATA         (7)                     /* request has data */
         unsigned long           rq_flags;       /* flags field */
  
         void *                  rq_argp;        /* decoded arguments */
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h

index 79ba50856707b9b9a619be78335b27b383312ed9..ab02a457da1fa8aea378889394730c708b03e89b 100644 (file)
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -25,7 +25,6 @@ struct svc_xprt_ops {
         void            (*xpo_detach)(struct svc_xprt *);
         void            (*xpo_free)(struct svc_xprt *);
         int             (*xpo_secure_port)(struct svc_rqst *);
-       void            (*xpo_adjust_wspace)(struct svc_xprt *);
  };
  
  struct svc_xprt_class {
@@ -69,6 +68,7 @@ struct svc_xprt {
  
         struct svc_serv         *xpt_server;    /* service for transport */
         atomic_t                xpt_reserved;   /* space on outq that is rsvd */
+       atomic_t                xpt_nr_rqsts;   /* Number of requests */
         struct mutex            xpt_mutex;      /* to serialize sending data */
         spinlock_t              xpt_lock;       /* protects sk_deferred
                                                  * and xpt_auth_cache */
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h

index 384041669489e196a1732b6419520b2a054ea651..5ee7aab95eb8499239ae2f74b0534dd393290a95 100644 (file)
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -94,6 +94,19 @@ enum ib_sa_selector {
         IB_SA_BEST = 3
  };
  
+/*
+ * There are 4 types of join states:
+ * FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember.
+ * The order corresponds to JoinState bits in MCMemberRecord.
+ */
+enum ib_sa_mc_join_states {
+       FULLMEMBER_JOIN,
+       NONMEMBER_JOIN,
+       SENDONLY_NONMEBER_JOIN,
+       SENDONLY_FULLMEMBER_JOIN,
+       NUM_JOIN_MEMBERSHIP_TYPES,
+};
+
  #define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT      BIT(12)
  
  /*
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h

index a8137dcf5a0072f10468c326a2581674b2e16369..8e90dd28bb7536d16058d711b096f8125bd42874 100644 (file)
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -562,6 +562,7 @@ enum ib_event_type {
         IB_EVENT_QP_LAST_WQE_REACHED,
         IB_EVENT_CLIENT_REREGISTER,
         IB_EVENT_GID_CHANGE,
+       IB_EVENT_WQ_FATAL,
  };
  
  const char *__attribute_const__ ib_event_msg(enum ib_event_type event);
@@ -572,6 +573,7 @@ struct ib_event {
                 struct ib_cq    *cq;
                 struct ib_qp    *qp;
                 struct ib_srq   *srq;
+               struct ib_wq    *wq;
                 u8              port_num;
         } element;
         enum ib_event_type      event;
@@ -1015,6 +1017,7 @@ struct ib_qp_init_attr {
          * Only needed for special QP types, or when using the RW API.
          */
         u8                      port_num;
+       struct ib_rwq_ind_table *rwq_ind_tbl;
  };
  
  struct ib_qp_open_attr {
@@ -1323,6 +1326,8 @@ struct ib_ucontext {
         struct list_head        ah_list;
         struct list_head        xrcd_list;
         struct list_head        rule_list;
+       struct list_head        wq_list;
+       struct list_head        rwq_ind_tbl_list;
         int                     closing;
  
         struct pid             *tgid;
@@ -1428,6 +1433,67 @@ struct ib_srq {
         } ext;
  };
  
+enum ib_wq_type {
+       IB_WQT_RQ
+};
+
+enum ib_wq_state {
+       IB_WQS_RESET,
+       IB_WQS_RDY,
+       IB_WQS_ERR
+};
+
+struct ib_wq {
+       struct ib_device       *device;
+       struct ib_uobject      *uobject;
+       void                *wq_context;
+       void                (*event_handler)(struct ib_event *, void *);
+       struct ib_pd           *pd;
+       struct ib_cq           *cq;
+       u32             wq_num;
+       enum ib_wq_state       state;
+       enum ib_wq_type wq_type;
+       atomic_t                usecnt;
+};
+
+struct ib_wq_init_attr {
+       void                   *wq_context;
+       enum ib_wq_type wq_type;
+       u32             max_wr;
+       u32             max_sge;
+       struct  ib_cq          *cq;
+       void                (*event_handler)(struct ib_event *, void *);
+};
+
+enum ib_wq_attr_mask {
+       IB_WQ_STATE     = 1 << 0,
+       IB_WQ_CUR_STATE = 1 << 1,
+};
+
+struct ib_wq_attr {
+       enum    ib_wq_state     wq_state;
+       enum    ib_wq_state     curr_wq_state;
+};
+
+struct ib_rwq_ind_table {
+       struct ib_device        *device;
+       struct ib_uobject      *uobject;
+       atomic_t                usecnt;
+       u32             ind_tbl_num;
+       u32             log_ind_tbl_size;
+       struct ib_wq    **ind_tbl;
+};
+
+struct ib_rwq_ind_table_init_attr {
+       u32             log_ind_tbl_size;
+       /* Each entry is a pointer to Receive Work Queue */
+       struct ib_wq    **ind_tbl;
+};
+
+/*
+ * @max_write_sge: Maximum SGE elements per RDMA WRITE request.
+ * @max_read_sge:  Maximum SGE elements per RDMA READ request.
+ */
  struct ib_qp {
         struct ib_device       *device;
         struct ib_pd           *pd;
@@ -1449,7 +1515,10 @@ struct ib_qp {
         void                  (*event_handler)(struct ib_event *, void *);
         void                   *qp_context;
         u32                     qp_num;
+       u32                     max_write_sge;
+       u32                     max_read_sge;
         enum ib_qp_type         qp_type;
+       struct ib_rwq_ind_table *rwq_ind_tbl;
  };
  
  struct ib_mr {
@@ -1506,6 +1575,7 @@ enum ib_flow_spec_type {
         IB_FLOW_SPEC_IB         = 0x22,
         /* L3 header*/
         IB_FLOW_SPEC_IPV4       = 0x30,
+       IB_FLOW_SPEC_IPV6       = 0x31,
         /* L4 headers*/
         IB_FLOW_SPEC_TCP        = 0x40,
         IB_FLOW_SPEC_UDP        = 0x41
@@ -1567,6 +1637,18 @@ struct ib_flow_spec_ipv4 {
         struct ib_flow_ipv4_filter mask;
  };
  
+struct ib_flow_ipv6_filter {
+       u8      src_ip[16];
+       u8      dst_ip[16];
+};
+
+struct ib_flow_spec_ipv6 {
+       enum ib_flow_spec_type     type;
+       u16                        size;
+       struct ib_flow_ipv6_filter val;
+       struct ib_flow_ipv6_filter mask;
+};
+
  struct ib_flow_tcp_udp_filter {
         __be16  dst_port;
         __be16  src_port;
@@ -1588,6 +1670,7 @@ union ib_flow_spec {
         struct ib_flow_spec_ib          ib;
         struct ib_flow_spec_ipv4        ipv4;
         struct ib_flow_spec_tcp_udp     tcp_udp;
+       struct ib_flow_spec_ipv6        ipv6;
  };
  
  struct ib_flow_attr {
@@ -1921,7 +2004,18 @@ struct ib_device {
                                                    struct ifla_vf_stats *stats);
         int                        (*set_vf_guid)(struct ib_device *device, int vf, u8 port, u64 guid,
                                                   int type);
-
+       struct ib_wq *             (*create_wq)(struct ib_pd *pd,
+                                               struct ib_wq_init_attr *init_attr,
+                                               struct ib_udata *udata);
+       int                        (*destroy_wq)(struct ib_wq *wq);
+       int                        (*modify_wq)(struct ib_wq *wq,
+                                               struct ib_wq_attr *attr,
+                                               u32 wq_attr_mask,
+                                               struct ib_udata *udata);
+       struct ib_rwq_ind_table *  (*create_rwq_ind_table)(struct ib_device *device,
+                                                          struct ib_rwq_ind_table_init_attr *init_attr,
+                                                          struct ib_udata *udata);
+       int                        (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table);
         struct ib_dma_mapping_ops   *dma_ops;
  
         struct module               *owner;
@@ -1956,6 +2050,7 @@ struct ib_device {
          * in fast paths.
          */
         int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *);
+       void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len);
  };
  
  struct ib_client {
@@ -1991,6 +2086,8 @@ struct ib_client {
  struct ib_device *ib_alloc_device(size_t size);
  void ib_dealloc_device(struct ib_device *device);
  
+void ib_get_device_fw_str(struct ib_device *device, char *str, size_t str_len);
+
  int ib_register_device(struct ib_device *device,
                        int (*port_callback)(struct ib_device *,
                                             u8, struct kobject *));
@@ -3168,6 +3265,15 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
  struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
                                             u16 pkey, const union ib_gid *gid,
                                             const struct sockaddr *addr);
+struct ib_wq *ib_create_wq(struct ib_pd *pd,
+                          struct ib_wq_init_attr *init_attr);
+int ib_destroy_wq(struct ib_wq *wq);
+int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *attr,
+                u32 wq_attr_mask);
+struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device,
+                                                struct ib_rwq_ind_table_init_attr*
+                                                wq_ind_table_init_attr);
+int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
  
  int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
                  unsigned int *sg_offset, unsigned int page_size);
diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h

index 2b95c2c336eb85db88c1054f991547c11a11ec4e..9303e0e4f508d8c524a34b979da7a359cb15b30c 100644 (file)
--- a/include/rdma/opa_port_info.h
+++ b/include/rdma/opa_port_info.h
@@ -33,11 +33,6 @@
  #if !defined(OPA_PORT_INFO_H)
  #define OPA_PORT_INFO_H
  
-/* Temporary until HFI driver is updated */
-#ifndef USE_PI_LED_ENABLE
-#define USE_PI_LED_ENABLE 0
-#endif
-
  #define OPA_PORT_LINK_MODE_NOP 0               /* No change */
  #define OPA_PORT_LINK_MODE_OPA 4               /* Port mode is OPA */
  
@@ -274,23 +269,12 @@ enum port_info_field_masks {
         OPA_PI_MASK_MTU_CAP                       = 0x0F,
  };
  
-#if USE_PI_LED_ENABLE
  struct opa_port_states {
         u8     reserved;
         u8     ledenable_offlinereason;   /* 1 res, 1 bit, 6 bits */
         u8     reserved2;
         u8     portphysstate_portstate;   /* 4 bits, 4 bits */
  };
-#define PI_LED_ENABLE_SUP 1
-#else
-struct opa_port_states {
-       u8     reserved;
-       u8     offline_reason;            /* 2 res, 6 bits */
-       u8     reserved2;
-       u8     portphysstate_portstate;   /* 4 bits, 4 bits */
-};
-#define PI_LED_ENABLE_SUP 0
-#endif
  
  struct opa_port_state_info {
         struct opa_port_states port_states;
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h

index afe44fde72a56599cb0dd1674078210a78438ae4..81fb1d15e8bb1b64d6fa7b9ba4f5eb095604738b 100644 (file)
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -333,11 +333,13 @@ int rdma_disconnect(struct rdma_cm_id *id);
   *   address.
   * @id: Communication identifier associated with the request.
   * @addr: Multicast address identifying the group to join.
+ * @join_state: Multicast JoinState bitmap requested by port.
+ *             Bitmap is based on IB_SA_MCMEMBER_REC_JOIN_STATE bits.
   * @context: User-defined context associated with the join request, returned
   * to the user through the private_data pointer in multicast events.
   */
  int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
-                       void *context);
+                       u8 join_state, void *context);
  
  /**
   * rdma_leave_multicast - Leave the multicast group specified by the given
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h

index 9c9a27d42aaa5e89aa778d3597cb54b13c270ecc..e31502107a58ca115e0ac1543899ed728f792fc4 100644 (file)
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@@ -158,6 +158,7 @@ struct rvt_driver_params {
         u32 max_mad_size;
         u8 qos_shift;
         u8 max_rdma_atomic;
+       u8 reserved_operations;
  };
  
  /* Protection domain */
@@ -351,6 +352,9 @@ struct rvt_dev_info {
         /* Driver specific properties */
         struct rvt_driver_params dparms;
  
+       /* post send table */
+       const struct rvt_operation_params *post_parms;
+
         struct rvt_mregion __rcu *dma_mr;
         struct rvt_lkey_table lkey_table;
  
@@ -484,6 +488,9 @@ void rvt_unregister_device(struct rvt_dev_info *rvd);
  int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
  int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port,
                   int port_index, u16 *pkey_table);
+int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key,
+                   int access);
+int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey);
  int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
                 u32 len, u64 vaddr, u32 rkey, int acc);
  int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
diff --git a/include/rdma/rdmavt_mr.h b/include/rdma/rdmavt_mr.h

index 5edffdca8c53b201a78bb503a6c6b63381672652..6b3c6c8b6b772a1774012d5b4f32b827a03f5537 100644 (file)
--- a/include/rdma/rdmavt_mr.h
+++ b/include/rdma/rdmavt_mr.h
@@ -81,6 +81,7 @@ struct rvt_mregion {
         u32 mapsz;              /* size of the map array */
         u8  page_shift;         /* 0 - non unform/non powerof2 sizes */
         u8  lkey_published;     /* in global table */
+       atomic_t lkey_invalid;  /* true if current lkey is invalid */
         struct completion comp; /* complete when refcount goes to zero */
         atomic_t refcount;
         struct rvt_segarray *map[0];    /* the segments */
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h

index 6d23b879416ac0f5249f33d7fd4713323e704961..bd34d0b56bf770feea3c7d4822cf8e3de8ec0fd3 100644 (file)
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -144,6 +144,12 @@
  #define RVT_PROCESS_OR_FLUSH_SEND \
         (RVT_PROCESS_SEND_OK | RVT_FLUSH_SEND)
  
+/*
+ * Internal send flags
+ */
+#define RVT_SEND_RESERVE_USED           IB_SEND_RESERVED_START
+#define RVT_SEND_COMPLETION_ONLY       (IB_SEND_RESERVED_START << 1)
+
  /*
   * Send work request queue entry.
   * The size of the sg_list is determined when the QP is created and stored
@@ -216,23 +222,43 @@ struct rvt_mmap_info {
   * to send a RDMA read response or atomic operation.
   */
  struct rvt_ack_entry {
-       u8 opcode;
-       u8 sent;
+       struct rvt_sge rdma_sge;
+       u64 atomic_data;
         u32 psn;
         u32 lpsn;
-       union {
-               struct rvt_sge rdma_sge;
-               u64 atomic_data;
-       };
+       u8 opcode;
+       u8 sent;
  };
  
  #define        RC_QP_SCALING_INTERVAL  5
  
-/*
- * Variables prefixed with s_ are for the requester (sender).
- * Variables prefixed with r_ are for the responder (receiver).
- * Variables prefixed with ack_ are for responder replies.
+#define RVT_OPERATION_PRIV        0x00000001
+#define RVT_OPERATION_ATOMIC      0x00000002
+#define RVT_OPERATION_ATOMIC_SGE  0x00000004
+#define RVT_OPERATION_LOCAL       0x00000008
+#define RVT_OPERATION_USE_RESERVE 0x00000010
+
+#define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1)
+
+/**
+ * struct rvt_operation_params - op table entry
+ * @length: the length to copy into the swqe entry
+ * @qpt_support: a bit mask indicating QP type support
+ * @flags: RVT_OPERATION flags (see above)
   *
+ * This supports table driven post send so that
+ * the driver can have differing and potentially
+ * different sets of operations.
+ *
+ **/
+
+struct rvt_operation_params {
+       size_t length;
+       u32 qpt_support;
+       u32 flags;
+};
+
+/*
   * Common variables are protected by both r_rq.lock and s_lock in that order
   * which only happens in modify_qp() or changing the QP 'state'.
   */
@@ -307,6 +333,7 @@ struct rvt_qp {
         u32 s_next_psn;         /* PSN for next request */
         u32 s_avail;            /* number of entries avail */
         u32 s_ssn;              /* SSN of tail entry */
+       atomic_t s_reserved_used; /* reserved entries in use */
  
         spinlock_t s_lock ____cacheline_aligned_in_smp;
         u32 s_flags;
@@ -343,6 +370,8 @@ struct rvt_qp {
         struct rvt_sge_state s_ack_rdma_sge;
         struct timer_list s_timer;
  
+       atomic_t local_ops_pending; /* number of fast_reg/local_inv reqs */
+
         /*
          * This sge list MUST be last. Do not add anything below here.
          */
@@ -436,6 +465,49 @@ static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n)
                   rq->max_sge * sizeof(struct ib_sge)) * n);
  }
  
+/**
+ * rvt_qp_wqe_reserve - reserve operation
+ * @qp: the rvt qp
+ * @wqe: the send wqe
+ *
+ * This routine is used in post send to record
+ * a wqe relative reserved operation use.
+ */
+static inline void rvt_qp_wqe_reserve(
+       struct rvt_qp *qp,
+       struct rvt_swqe *wqe)
+{
+       wqe->wr.send_flags |= RVT_SEND_RESERVE_USED;
+       atomic_inc(&qp->s_reserved_used);
+}
+
+/**
+ * rvt_qp_wqe_unreserve - clean reserved operation
+ * @qp: the rvt qp
+ * @wqe: the send wqe
+ *
+ * This decrements the reserve use count.
+ *
+ * This call MUST precede the change to
+ * s_last to ensure that post send sees a stable
+ * s_avail.
+ *
+ * An smp_mb__after_atomic() is used to ensure
+ * the compiler does not juggle the order of the s_last
+ * ring index and the decrementing of s_reserved_used.
+ */
+static inline void rvt_qp_wqe_unreserve(
+       struct rvt_qp *qp,
+       struct rvt_swqe *wqe)
+{
+       if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) {
+               wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED;
+               atomic_dec(&qp->s_reserved_used);
+               /* ensure no compiler re-order up to s_last change */
+               smp_mb__after_atomic();
+       }
+}
+
  extern const int  ib_rvt_state_ops[];
  
  struct rvt_dev_info;
diff --git a/include/scsi/viosrp.h b/include/scsi/viosrp.h

new file mode 100644 (file)

index 0000000..974e07b
--- /dev/null
+++ b/include/scsi/viosrp.h
@@ -0,0 +1,220 @@
+/*****************************************************************************/
+/* srp.h -- SCSI RDMA Protocol definitions                                   */
+/*                                                                           */
+/* Written By: Colin Devilbis, IBM Corporation                               */
+/*                                                                           */
+/* Copyright (C) 2003 IBM Corporation                                        */
+/*                                                                           */
+/* This program is free software; you can redistribute it and/or modify      */
+/* it under the terms of the GNU General Public License as published by      */
+/* the Free Software Foundation; either version 2 of the License, or         */
+/* (at your option) any later version.                                       */
+/*                                                                           */
+/* This program is distributed in the hope that it will be useful,           */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of            */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             */
+/* GNU General Public License for more details.                              */
+/*                                                                           */
+/* This file contains structures and definitions for IBM RPA (RS/6000        */
+/* platform architecture) implementation of the SRP (SCSI RDMA Protocol)     */
+/* standard.  SRP is used on IBM iSeries and pSeries platforms to send SCSI  */
+/* commands between logical partitions.                                      */
+/*                                                                           */
+/* SRP Information Units (IUs) are sent on a "Command/Response Queue" (CRQ)  */
+/* between partitions.  The definitions in this file are architected,        */
+/* and cannot be changed without breaking compatibility with other versions  */
+/* of Linux and other operating systems (AIX, OS/400) that talk this protocol*/
+/* between logical partitions                                                */
+/*****************************************************************************/
+#ifndef VIOSRP_H
+#define VIOSRP_H
+#include <scsi/srp.h>
+
+#define SRP_VERSION "16.a"
+#define SRP_MAX_IU_LEN 256
+#define SRP_MAX_LOC_LEN 32
+
+union srp_iu {
+       struct srp_login_req login_req;
+       struct srp_login_rsp login_rsp;
+       struct srp_login_rej login_rej;
+       struct srp_i_logout i_logout;
+       struct srp_t_logout t_logout;
+       struct srp_tsk_mgmt tsk_mgmt;
+       struct srp_cmd cmd;
+       struct srp_rsp rsp;
+       u8 reserved[SRP_MAX_IU_LEN];
+};
+
+enum viosrp_crq_headers {
+       VIOSRP_CRQ_FREE = 0x00,
+       VIOSRP_CRQ_CMD_RSP = 0x80,
+       VIOSRP_CRQ_INIT_RSP = 0xC0,
+       VIOSRP_CRQ_XPORT_EVENT = 0xFF
+};
+
+enum viosrp_crq_init_formats {
+       VIOSRP_CRQ_INIT = 0x01,
+       VIOSRP_CRQ_INIT_COMPLETE = 0x02
+};
+
+enum viosrp_crq_formats {
+       VIOSRP_SRP_FORMAT = 0x01,
+       VIOSRP_MAD_FORMAT = 0x02,
+       VIOSRP_OS400_FORMAT = 0x03,
+       VIOSRP_AIX_FORMAT = 0x04,
+       VIOSRP_LINUX_FORMAT = 0x05,
+       VIOSRP_INLINE_FORMAT = 0x06
+};
+
+enum viosrp_crq_status {
+       VIOSRP_OK = 0x0,
+       VIOSRP_NONRECOVERABLE_ERR = 0x1,
+       VIOSRP_VIOLATES_MAX_XFER = 0x2,
+       VIOSRP_PARTNER_PANIC = 0x3,
+       VIOSRP_DEVICE_BUSY = 0x8,
+       VIOSRP_ADAPTER_FAIL = 0x10,
+       VIOSRP_OK2 = 0x99,
+};
+
+struct viosrp_crq {
+       u8 valid;               /* used by RPA */
+       u8 format;              /* SCSI vs out-of-band */
+       u8 reserved;
+       u8 status;              /* non-scsi failure? (e.g. DMA failure) */
+       __be16 timeout;         /* in seconds */
+       __be16 IU_length;               /* in bytes */
+       __be64 IU_data_ptr;     /* the TCE for transferring data */
+};
+
+/* MADs are Management requests above and beyond the IUs defined in the SRP
+ * standard.
+ */
+enum viosrp_mad_types {
+       VIOSRP_EMPTY_IU_TYPE = 0x01,
+       VIOSRP_ERROR_LOG_TYPE = 0x02,
+       VIOSRP_ADAPTER_INFO_TYPE = 0x03,
+       VIOSRP_CAPABILITIES_TYPE = 0x05,
+       VIOSRP_ENABLE_FAST_FAIL = 0x08,
+};
+
+enum viosrp_mad_status {
+       VIOSRP_MAD_SUCCESS = 0x00,
+       VIOSRP_MAD_NOT_SUPPORTED = 0xF1,
+       VIOSRP_MAD_FAILED = 0xF7,
+};
+
+enum viosrp_capability_type {
+       MIGRATION_CAPABILITIES = 0x01,
+       RESERVATION_CAPABILITIES = 0x02,
+};
+
+enum viosrp_capability_support {
+       SERVER_DOES_NOT_SUPPORTS_CAP = 0x0,
+       SERVER_SUPPORTS_CAP = 0x01,
+       SERVER_CAP_DATA = 0x02,
+};
+
+enum viosrp_reserve_type {
+       CLIENT_RESERVE_SCSI_2 = 0x01,
+};
+
+enum viosrp_capability_flag {
+       CLIENT_MIGRATED = 0x01,
+       CLIENT_RECONNECT = 0x02,
+       CAP_LIST_SUPPORTED = 0x04,
+       CAP_LIST_DATA = 0x08,
+};
+
+/*
+ * Common MAD header
+ */
+struct mad_common {
+       __be32 type;
+       __be16 status;
+       __be16 length;
+       __be64 tag;
+};
+
+/*
+ * All SRP (and MAD) requests normally flow from the
+ * client to the server.  There is no way for the server to send
+ * an asynchronous message back to the client.  The Empty IU is used
+ * to hang out a meaningless request to the server so that it can respond
+ * asynchronously with something like a SCSI AER
+ */
+struct viosrp_empty_iu {
+       struct mad_common common;
+       __be64 buffer;
+       __be32 port;
+};
+
+struct viosrp_error_log {
+       struct mad_common common;
+       __be64 buffer;
+};
+
+struct viosrp_adapter_info {
+       struct mad_common common;
+       __be64 buffer;
+};
+
+struct viosrp_fast_fail {
+       struct mad_common common;
+};
+
+struct viosrp_capabilities {
+       struct mad_common common;
+       __be64 buffer;
+};
+
+struct mad_capability_common {
+       __be32 cap_type;
+       __be16 length;
+       __be16 server_support;
+};
+
+struct mad_reserve_cap {
+       struct mad_capability_common common;
+       __be32 type;
+};
+
+struct mad_migration_cap {
+       struct mad_capability_common common;
+       __be32 ecl;
+};
+
+struct capabilities {
+       __be32 flags;
+       char name[SRP_MAX_LOC_LEN];
+       char loc[SRP_MAX_LOC_LEN];
+       struct mad_migration_cap migration;
+       struct mad_reserve_cap reserve;
+};
+
+union mad_iu {
+       struct viosrp_empty_iu empty_iu;
+       struct viosrp_error_log error_log;
+       struct viosrp_adapter_info adapter_info;
+       struct viosrp_fast_fail fast_fail;
+       struct viosrp_capabilities capabilities;
+};
+
+union viosrp_iu {
+       union srp_iu srp;
+       union mad_iu mad;
+};
+
+struct mad_adapter_info_data {
+       char srp_version[8];
+       char partition_name[96];
+       __be32 partition_number;
+#define SRP_MAD_VERSION_1 1
+       __be32 mad_version;
+#define SRP_MAD_OS_LINUX 2
+#define SRP_MAD_OS_AIX 3
+       __be32 os_type;
+       __be32 port_max_txu[8]; /* per-port maximum transfer */
+};
+
+#endif
diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h

index d8ab5101fad5533876339f788b75d855d382bc4f..f6f3bc52c1ac2e21611ba7be2a274c7cb442166d 100644 (file)
--- a/include/target/target_core_backend.h
+++ b/include/target/target_core_backend.h
@@ -95,6 +95,6 @@ sense_reason_t passthrough_parse_cdb(struct se_cmd *cmd,
  bool target_sense_desc_format(struct se_device *dev);
  sector_t target_to_linux_sector(struct se_device *dev, sector_t lb);
  bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
-                                      struct request_queue *q, int block_size);
+                                      struct request_queue *q);
  
  #endif /* TARGET_CORE_BACKEND_H */
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h

index b316b44d03f3667fea87daef334a3e5a5060f5bc..fb8e3b6febdff7f5fdb3f0335455b9e074647c40 100644 (file)
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -142,6 +142,7 @@ enum se_cmd_flags_table {
         SCF_PASSTHROUGH_PROT_SG_TO_MEM_NOALLOC = 0x00200000,
         SCF_ACK_KREF                    = 0x00400000,
         SCF_USE_CPUID                   = 0x00800000,
+       SCF_TASK_ATTR_SET               = 0x01000000,
  };
  
  /*
diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h

index de44462a7680c2071a4f37088e7241209de117f3..5cd6faa6e0d166ed07444cf5e3735e2626483205 100644 (file)
--- a/include/target/target_core_fabric.h
+++ b/include/target/target_core_fabric.h
@@ -163,7 +163,6 @@ int core_tmr_alloc_req(struct se_cmd *, void *, u8, gfp_t);
  void   core_tmr_release_req(struct se_tmr_req *);
  int    transport_generic_handle_tmr(struct se_cmd *);
  void   transport_generic_request_failure(struct se_cmd *, sense_reason_t);
-void   __target_execute_cmd(struct se_cmd *);
  int    transport_lookup_tmr_lun(struct se_cmd *, u64);
  void   core_allocate_nexus_loss_ua(struct se_node_acl *acl);
  
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h

index 5b81ef304388ed537033e7c4b67850e1385f5e0e..e030d6f6c19acb090348298e664739a8cb06c111 100644 (file)
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -66,6 +66,21 @@ struct btrfs_qgroup_extent_record;
         { BTRFS_BLOCK_GROUP_RAID6,      "RAID6"}
  
  #define BTRFS_UUID_SIZE 16
+#define TP_STRUCT__entry_fsid __array(u8, fsid, BTRFS_UUID_SIZE)
+
+#define TP_fast_assign_fsid(fs_info)                                   \
+       memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE)
+
+#define TP_STRUCT__entry_btrfs(args...)                                        \
+       TP_STRUCT__entry(                                               \
+               TP_STRUCT__entry_fsid                                   \
+               args)
+#define TP_fast_assign_btrfs(fs_info, args...)                         \
+       TP_fast_assign(                                                 \
+               TP_fast_assign_fsid(fs_info);                           \
+               args)
+#define TP_printk_btrfs(fmt, args...) \
+       TP_printk("%pU: " fmt, __entry->fsid, args)
  
  TRACE_EVENT(btrfs_transaction_commit,
  
@@ -73,17 +88,17 @@ TRACE_EVENT(btrfs_transaction_commit,
  
         TP_ARGS(root),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,  generation                )
                 __field(        u64,  root_objectid             )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(root->fs_info,
                 __entry->generation     = root->fs_info->generation;
                 __entry->root_objectid  = root->root_key.objectid;
         ),
  
-       TP_printk("root = %llu(%s), gen = %llu",
+       TP_printk_btrfs("root = %llu(%s), gen = %llu",
                   show_root_type(__entry->root_objectid),
                   (unsigned long long)__entry->generation)
  );
@@ -94,7 +109,7 @@ DECLARE_EVENT_CLASS(btrfs__inode,
  
         TP_ARGS(inode),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        ino_t,  ino                     )
                 __field(        blkcnt_t,  blocks               )
                 __field(        u64,  disk_i_size               )
@@ -104,7 +119,7 @@ DECLARE_EVENT_CLASS(btrfs__inode,
                 __field(        u64,  root_objectid             )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
                 __entry->ino    = inode->i_ino;
                 __entry->blocks = inode->i_blocks;
                 __entry->disk_i_size  = BTRFS_I(inode)->disk_i_size;
@@ -115,7 +130,7 @@ DECLARE_EVENT_CLASS(btrfs__inode,
                                 BTRFS_I(inode)->root->root_key.objectid;
         ),
  
-       TP_printk("root = %llu(%s), gen = %llu, ino = %lu, blocks = %llu, "
+       TP_printk_btrfs("root = %llu(%s), gen = %llu, ino = %lu, blocks = %llu, "
                   "disk_i_size = %llu, last_trans = %llu, logged_trans = %llu",
                   show_root_type(__entry->root_objectid),
                   (unsigned long long)__entry->generation,
@@ -175,7 +190,7 @@ TRACE_EVENT_CONDITION(btrfs_get_extent,
  
         TP_CONDITION(map),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,  root_objectid     )
                 __field(        u64,  start             )
                 __field(        u64,  len               )
@@ -187,7 +202,7 @@ TRACE_EVENT_CONDITION(btrfs_get_extent,
                 __field(        unsigned int,  compress_type    )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(root->fs_info,
                 __entry->root_objectid  = root->root_key.objectid;
                 __entry->start          = map->start;
                 __entry->len            = map->len;
@@ -199,7 +214,7 @@ TRACE_EVENT_CONDITION(btrfs_get_extent,
                 __entry->compress_type  = map->compress_type;
         ),
  
-       TP_printk("root = %llu(%s), start = %llu, len = %llu, "
+       TP_printk_btrfs("root = %llu(%s), start = %llu, len = %llu, "
                   "orig_start = %llu, block_start = %llu(%s), "
                   "block_len = %llu, flags = %s, refs = %u, "
                   "compress_type = %u",
@@ -233,7 +248,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent,
  
         TP_ARGS(inode, ordered),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        ino_t,  ino             )
                 __field(        u64,  file_offset       )
                 __field(        u64,  start             )
@@ -246,7 +261,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent,
                 __field(        u64,  root_objectid     )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
                 __entry->ino            = inode->i_ino;
                 __entry->file_offset    = ordered->file_offset;
                 __entry->start          = ordered->start;
@@ -260,7 +275,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent,
                                 BTRFS_I(inode)->root->root_key.objectid;
         ),
  
-       TP_printk("root = %llu(%s), ino = %llu, file_offset = %llu, "
+       TP_printk_btrfs("root = %llu(%s), ino = %llu, file_offset = %llu, "
                   "start = %llu, len = %llu, disk_len = %llu, "
                   "bytes_left = %llu, flags = %s, compress_type = %d, "
                   "refs = %d",
@@ -310,7 +325,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
  
         TP_ARGS(page, inode, wbc),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        ino_t,  ino                     )
                 __field(        pgoff_t,  index                 )
                 __field(        long,   nr_to_write             )
@@ -324,7 +339,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
                 __field(        u64,    root_objectid           )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
                 __entry->ino            = inode->i_ino;
                 __entry->index          = page->index;
                 __entry->nr_to_write    = wbc->nr_to_write;
@@ -339,7 +354,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
                                  BTRFS_I(inode)->root->root_key.objectid;
         ),
  
-       TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, "
+       TP_printk_btrfs("root = %llu(%s), ino = %lu, page_index = %lu, "
                   "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, "
                   "range_end = %llu, for_kupdate = %d, "
                   "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu",
@@ -366,7 +381,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook,
  
         TP_ARGS(page, start, end, uptodate),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        ino_t,   ino            )
                 __field(        pgoff_t, index          )
                 __field(        u64,     start          )
@@ -375,7 +390,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook,
                 __field(        u64,    root_objectid   )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(btrfs_sb(page->mapping->host->i_sb),
                 __entry->ino    = page->mapping->host->i_ino;
                 __entry->index  = page->index;
                 __entry->start  = start;
@@ -385,7 +400,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook,
                          BTRFS_I(page->mapping->host)->root->root_key.objectid;
         ),
  
-       TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, start = %llu, "
+       TP_printk_btrfs("root = %llu(%s), ino = %lu, page_index = %lu, start = %llu, "
                   "end = %llu, uptodate = %d",
                   show_root_type(__entry->root_objectid),
                   (unsigned long)__entry->ino, (unsigned long)__entry->index,
@@ -399,7 +414,7 @@ TRACE_EVENT(btrfs_sync_file,
  
         TP_ARGS(file, datasync),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        ino_t,  ino             )
                 __field(        ino_t,  parent          )
                 __field(        int,    datasync        )
@@ -410,6 +425,7 @@ TRACE_EVENT(btrfs_sync_file,
                 struct dentry *dentry = file->f_path.dentry;
                 struct inode *inode = d_inode(dentry);
  
+               TP_fast_assign_fsid(btrfs_sb(file->f_path.dentry->d_sb));
                 __entry->ino            = inode->i_ino;
                 __entry->parent         = d_inode(dentry->d_parent)->i_ino;
                 __entry->datasync       = datasync;
@@ -417,7 +433,7 @@ TRACE_EVENT(btrfs_sync_file,
                                  BTRFS_I(inode)->root->root_key.objectid;
         ),
  
-       TP_printk("root = %llu(%s), ino = %ld, parent = %ld, datasync = %d",
+       TP_printk_btrfs("root = %llu(%s), ino = %ld, parent = %ld, datasync = %d",
                   show_root_type(__entry->root_objectid),
                   (unsigned long)__entry->ino, (unsigned long)__entry->parent,
                   __entry->datasync)
@@ -425,19 +441,19 @@ TRACE_EVENT(btrfs_sync_file,
  
  TRACE_EVENT(btrfs_sync_fs,
  
-       TP_PROTO(int wait),
+       TP_PROTO(struct btrfs_fs_info *fs_info, int wait),
  
-       TP_ARGS(wait),
+       TP_ARGS(fs_info, wait),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        int,  wait              )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(fs_info,
                 __entry->wait   = wait;
         ),
  
-       TP_printk("wait = %d", __entry->wait)
+       TP_printk_btrfs("wait = %d", __entry->wait)
  );
  
  TRACE_EVENT(btrfs_add_block_group,
@@ -490,13 +506,14 @@ TRACE_EVENT(btrfs_add_block_group,
  
  DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,
  
-       TP_PROTO(struct btrfs_delayed_ref_node *ref,
+       TP_PROTO(struct btrfs_fs_info *fs_info,
+                struct btrfs_delayed_ref_node *ref,
                  struct btrfs_delayed_tree_ref *full_ref,
                  int action),
  
-       TP_ARGS(ref, full_ref, action),
+       TP_ARGS(fs_info, ref, full_ref, action),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,  bytenr            )
                 __field(        u64,  num_bytes         )
                 __field(        int,  action            ) 
@@ -507,7 +524,7 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,
                 __field(        u64,  seq               )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(fs_info,
                 __entry->bytenr         = ref->bytenr;
                 __entry->num_bytes      = ref->num_bytes;
                 __entry->action         = action;
@@ -518,7 +535,7 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,
                 __entry->seq            = ref->seq;
         ),
  
-       TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
+       TP_printk_btrfs("bytenr = %llu, num_bytes = %llu, action = %s, "
                   "parent = %llu(%s), ref_root = %llu(%s), level = %d, "
                   "type = %s, seq = %llu",
                   (unsigned long long)__entry->bytenr,
@@ -532,31 +549,34 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,
  
  DEFINE_EVENT(btrfs_delayed_tree_ref,  add_delayed_tree_ref,
  
-       TP_PROTO(struct btrfs_delayed_ref_node *ref,
+       TP_PROTO(struct btrfs_fs_info *fs_info,
+                struct btrfs_delayed_ref_node *ref,
                  struct btrfs_delayed_tree_ref *full_ref,
                  int action),
  
-       TP_ARGS(ref, full_ref, action)
+       TP_ARGS(fs_info, ref, full_ref, action)
  );
  
  DEFINE_EVENT(btrfs_delayed_tree_ref,  run_delayed_tree_ref,
  
-       TP_PROTO(struct btrfs_delayed_ref_node *ref,
+       TP_PROTO(struct btrfs_fs_info *fs_info,
+                struct btrfs_delayed_ref_node *ref,
                  struct btrfs_delayed_tree_ref *full_ref,
                  int action),
  
-       TP_ARGS(ref, full_ref, action)
+       TP_ARGS(fs_info, ref, full_ref, action)
  );
  
  DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,
  
-       TP_PROTO(struct btrfs_delayed_ref_node *ref,
+       TP_PROTO(struct btrfs_fs_info *fs_info,
+                struct btrfs_delayed_ref_node *ref,
                  struct btrfs_delayed_data_ref *full_ref,
                  int action),
  
-       TP_ARGS(ref, full_ref, action),
+       TP_ARGS(fs_info, ref, full_ref, action),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,  bytenr            )
                 __field(        u64,  num_bytes         )
                 __field(        int,  action            ) 
@@ -568,7 +588,7 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,
                 __field(        u64,  seq               )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(fs_info,
                 __entry->bytenr         = ref->bytenr;
                 __entry->num_bytes      = ref->num_bytes;
                 __entry->action         = action;
@@ -580,7 +600,7 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,
                 __entry->seq            = ref->seq;
         ),
  
-       TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
+       TP_printk_btrfs("bytenr = %llu, num_bytes = %llu, action = %s, "
                   "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, "
                   "offset = %llu, type = %s, seq = %llu",
                   (unsigned long long)__entry->bytenr,
@@ -596,45 +616,48 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,
  
  DEFINE_EVENT(btrfs_delayed_data_ref,  add_delayed_data_ref,
  
-       TP_PROTO(struct btrfs_delayed_ref_node *ref,
+       TP_PROTO(struct btrfs_fs_info *fs_info,
+                struct btrfs_delayed_ref_node *ref,
                  struct btrfs_delayed_data_ref *full_ref,
                  int action),
  
-       TP_ARGS(ref, full_ref, action)
+       TP_ARGS(fs_info, ref, full_ref, action)
  );
  
  DEFINE_EVENT(btrfs_delayed_data_ref,  run_delayed_data_ref,
  
-       TP_PROTO(struct btrfs_delayed_ref_node *ref,
+       TP_PROTO(struct btrfs_fs_info *fs_info,
+                struct btrfs_delayed_ref_node *ref,
                  struct btrfs_delayed_data_ref *full_ref,
                  int action),
  
-       TP_ARGS(ref, full_ref, action)
+       TP_ARGS(fs_info, ref, full_ref, action)
  );
  
  DECLARE_EVENT_CLASS(btrfs_delayed_ref_head,
  
-       TP_PROTO(struct btrfs_delayed_ref_node *ref,
+       TP_PROTO(struct btrfs_fs_info *fs_info,
+                struct btrfs_delayed_ref_node *ref,
                  struct btrfs_delayed_ref_head *head_ref,
                  int action),
  
-       TP_ARGS(ref, head_ref, action),
+       TP_ARGS(fs_info, ref, head_ref, action),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,  bytenr            )
                 __field(        u64,  num_bytes         )
                 __field(        int,  action            ) 
                 __field(        int,  is_data           )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(fs_info,
                 __entry->bytenr         = ref->bytenr;
                 __entry->num_bytes      = ref->num_bytes;
                 __entry->action         = action;
                 __entry->is_data        = head_ref->is_data;
         ),
  
-       TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, is_data = %d",
+       TP_printk_btrfs("bytenr = %llu, num_bytes = %llu, action = %s, is_data = %d",
                   (unsigned long long)__entry->bytenr,
                   (unsigned long long)__entry->num_bytes,
                   show_ref_action(__entry->action),
@@ -643,20 +666,22 @@ DECLARE_EVENT_CLASS(btrfs_delayed_ref_head,
  
  DEFINE_EVENT(btrfs_delayed_ref_head,  add_delayed_ref_head,
  
-       TP_PROTO(struct btrfs_delayed_ref_node *ref,
+       TP_PROTO(struct btrfs_fs_info *fs_info,
+                struct btrfs_delayed_ref_node *ref,
                  struct btrfs_delayed_ref_head *head_ref,
                  int action),
  
-       TP_ARGS(ref, head_ref, action)
+       TP_ARGS(fs_info, ref, head_ref, action)
  );
  
  DEFINE_EVENT(btrfs_delayed_ref_head,  run_delayed_ref_head,
  
-       TP_PROTO(struct btrfs_delayed_ref_node *ref,
+       TP_PROTO(struct btrfs_fs_info *fs_info,
+                struct btrfs_delayed_ref_node *ref,
                  struct btrfs_delayed_ref_head *head_ref,
                  int action),
  
-       TP_ARGS(ref, head_ref, action)
+       TP_ARGS(fs_info, ref, head_ref, action)
  );
  
  #define show_chunk_type(type)                                  \
@@ -678,7 +703,7 @@ DECLARE_EVENT_CLASS(btrfs__chunk,
  
         TP_ARGS(root, map, offset, size),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        int,  num_stripes               )
                 __field(        u64,  type                      )
                 __field(        int,  sub_stripes               )
@@ -687,7 +712,7 @@ DECLARE_EVENT_CLASS(btrfs__chunk,
                 __field(        u64,  root_objectid             )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(root->fs_info,
                 __entry->num_stripes    = map->num_stripes;
                 __entry->type           = map->type;
                 __entry->sub_stripes    = map->sub_stripes;
@@ -696,7 +721,7 @@ DECLARE_EVENT_CLASS(btrfs__chunk,
                 __entry->root_objectid  = root->root_key.objectid;
         ),
  
-       TP_printk("root = %llu(%s), offset = %llu, size = %llu, "
+       TP_printk_btrfs("root = %llu(%s), offset = %llu, size = %llu, "
                   "num_stripes = %d, sub_stripes = %d, type = %s",
                   show_root_type(__entry->root_objectid),
                   (unsigned long long)__entry->offset,
@@ -728,7 +753,7 @@ TRACE_EVENT(btrfs_cow_block,
  
         TP_ARGS(root, buf, cow),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,  root_objectid             )
                 __field(        u64,  buf_start                 )
                 __field(        int,  refs                      )
@@ -737,7 +762,7 @@ TRACE_EVENT(btrfs_cow_block,
                 __field(        int,  cow_level                 )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(root->fs_info,
                 __entry->root_objectid  = root->root_key.objectid;
                 __entry->buf_start      = buf->start;
                 __entry->refs           = atomic_read(&buf->refs);
@@ -746,7 +771,7 @@ TRACE_EVENT(btrfs_cow_block,
                 __entry->cow_level      = btrfs_header_level(cow);
         ),
  
-       TP_printk("root = %llu(%s), refs = %d, orig_buf = %llu "
+       TP_printk_btrfs("root = %llu(%s), refs = %d, orig_buf = %llu "
                   "(orig_level = %d), cow_buf = %llu (cow_level = %d)",
                   show_root_type(__entry->root_objectid),
                   __entry->refs,
@@ -763,25 +788,23 @@ TRACE_EVENT(btrfs_space_reservation,
  
         TP_ARGS(fs_info, type, val, bytes, reserve),
  
-       TP_STRUCT__entry(
-               __array(        u8,     fsid,   BTRFS_UUID_SIZE )
+       TP_STRUCT__entry_btrfs(
                 __string(       type,   type                    )
                 __field(        u64,    val                     )
                 __field(        u64,    bytes                   )
                 __field(        int,    reserve                 )
         ),
  
-       TP_fast_assign(
-               memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE);
+       TP_fast_assign_btrfs(fs_info,
                 __assign_str(type, type);
                 __entry->val            = val;
                 __entry->bytes          = bytes;
                 __entry->reserve        = reserve;
         ),
  
-       TP_printk("%pU: %s: %Lu %s %Lu", __entry->fsid, __get_str(type),
-                 __entry->val, __entry->reserve ? "reserve" : "release",
-                 __entry->bytes)
+       TP_printk_btrfs("%s: %Lu %s %Lu", __get_str(type), __entry->val,
+                       __entry->reserve ? "reserve" : "release",
+                       __entry->bytes)
  );
  
  #define show_flush_action(action)                                              \
@@ -872,22 +895,19 @@ DECLARE_EVENT_CLASS(btrfs__reserved_extent,
  
         TP_ARGS(root, start, len),
  
-       TP_STRUCT__entry(
-               __array(        u8,     fsid,   BTRFS_UUID_SIZE )
-               __field(        u64,    root_objectid           )
-               __field(        u64,    start                   )
-               __field(        u64,    len                     )
+       TP_STRUCT__entry_btrfs(
+               __field(        u64,  root_objectid             )
+               __field(        u64,  start                     )
+               __field(        u64,  len                       )
         ),
  
-       TP_fast_assign(
-               memcpy(__entry->fsid, root->fs_info->fsid, BTRFS_UUID_SIZE);
+       TP_fast_assign_btrfs(root->fs_info,
                 __entry->root_objectid  = root->root_key.objectid;
                 __entry->start          = start;
                 __entry->len            = len;
         ),
  
-       TP_printk("%pU: root = %llu(%s), start = %llu, len = %llu",
-                 __entry->fsid,
+       TP_printk_btrfs("root = %llu(%s), start = %llu, len = %llu",
                   show_root_type(__entry->root_objectid),
                   (unsigned long long)__entry->start,
                   (unsigned long long)__entry->len)
@@ -914,21 +934,21 @@ TRACE_EVENT(find_free_extent,
  
         TP_ARGS(root, num_bytes, empty_size, data),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,    root_objectid           )
                 __field(        u64,    num_bytes               )
                 __field(        u64,    empty_size              )
                 __field(        u64,    data                    )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(root->fs_info,
                 __entry->root_objectid  = root->root_key.objectid;
                 __entry->num_bytes      = num_bytes;
                 __entry->empty_size     = empty_size;
                 __entry->data           = data;
         ),
  
-       TP_printk("root = %Lu(%s), len = %Lu, empty_size = %Lu, "
+       TP_printk_btrfs("root = %Lu(%s), len = %Lu, empty_size = %Lu, "
                   "flags = %Lu(%s)", show_root_type(__entry->root_objectid),
                   __entry->num_bytes, __entry->empty_size, __entry->data,
                   __print_flags((unsigned long)__entry->data, "|",
@@ -943,8 +963,7 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent,
  
         TP_ARGS(root, block_group, start, len),
  
-       TP_STRUCT__entry(
-               __array(        u8,     fsid,   BTRFS_UUID_SIZE )
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,    root_objectid           )
                 __field(        u64,    bg_objectid             )
                 __field(        u64,    flags                   )
@@ -952,8 +971,7 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent,
                 __field(        u64,    len                     )
         ),
  
-       TP_fast_assign(
-               memcpy(__entry->fsid, root->fs_info->fsid, BTRFS_UUID_SIZE);
+       TP_fast_assign_btrfs(root->fs_info,
                 __entry->root_objectid  = root->root_key.objectid;
                 __entry->bg_objectid    = block_group->key.objectid;
                 __entry->flags          = block_group->flags;
@@ -961,8 +979,8 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent,
                 __entry->len            = len;
         ),
  
-       TP_printk("%pU: root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), "
-                 "start = %Lu, len = %Lu", __entry->fsid,
+       TP_printk_btrfs("root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), "
+                 "start = %Lu, len = %Lu",
                   show_root_type(__entry->root_objectid), __entry->bg_objectid,
                   __entry->flags, __print_flags((unsigned long)__entry->flags,
                                                 "|", BTRFS_GROUP_FLAGS),
@@ -994,7 +1012,7 @@ TRACE_EVENT(btrfs_find_cluster,
  
         TP_ARGS(block_group, start, bytes, empty_size, min_bytes),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,    bg_objectid             )
                 __field(        u64,    flags                   )
                 __field(        u64,    start                   )
@@ -1003,7 +1021,7 @@ TRACE_EVENT(btrfs_find_cluster,
                 __field(        u64,    min_bytes               )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(block_group->fs_info,
                 __entry->bg_objectid    = block_group->key.objectid;
                 __entry->flags          = block_group->flags;
                 __entry->start          = start;
@@ -1012,7 +1030,7 @@ TRACE_EVENT(btrfs_find_cluster,
                 __entry->min_bytes      = min_bytes;
         ),
  
-       TP_printk("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu,"
+       TP_printk_btrfs("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu,"
                   " empty_size = %Lu, min_bytes = %Lu", __entry->bg_objectid,
                   __entry->flags,
                   __print_flags((unsigned long)__entry->flags, "|",
@@ -1026,15 +1044,15 @@ TRACE_EVENT(btrfs_failed_cluster_setup,
  
         TP_ARGS(block_group),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,    bg_objectid             )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(block_group->fs_info,
                 __entry->bg_objectid    = block_group->key.objectid;
         ),
  
-       TP_printk("block_group = %Lu", __entry->bg_objectid)
+       TP_printk_btrfs("block_group = %Lu", __entry->bg_objectid)
  );
  
  TRACE_EVENT(btrfs_setup_cluster,
@@ -1044,7 +1062,7 @@ TRACE_EVENT(btrfs_setup_cluster,
  
         TP_ARGS(block_group, cluster, size, bitmap),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,    bg_objectid             )
                 __field(        u64,    flags                   )
                 __field(        u64,    start                   )
@@ -1053,7 +1071,7 @@ TRACE_EVENT(btrfs_setup_cluster,
                 __field(        int,    bitmap                  )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(block_group->fs_info,
                 __entry->bg_objectid    = block_group->key.objectid;
                 __entry->flags          = block_group->flags;
                 __entry->start          = cluster->window_start;
@@ -1062,7 +1080,7 @@ TRACE_EVENT(btrfs_setup_cluster,
                 __entry->bitmap         = bitmap;
         ),
  
-       TP_printk("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, "
+       TP_printk_btrfs("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, "
                   "size = %Lu, max_size = %Lu, bitmap = %d",
                   __entry->bg_objectid,
                   __entry->flags,
@@ -1120,7 +1138,7 @@ DECLARE_EVENT_CLASS(btrfs__work,
  
         TP_ARGS(work),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        void *, work                    )
                 __field(        void *, wq                      )
                 __field(        void *, func                    )
@@ -1129,7 +1147,7 @@ DECLARE_EVENT_CLASS(btrfs__work,
                 __field(        void *, normal_work             )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(btrfs_work_owner(work),
                 __entry->work           = work;
                 __entry->wq             = work->wq;
                 __entry->func           = work->func;
@@ -1138,7 +1156,7 @@ DECLARE_EVENT_CLASS(btrfs__work,
                 __entry->normal_work    = &work->normal_work;
         ),
  
-       TP_printk("work=%p (normal_work=%p), wq=%p, func=%pf, ordered_func=%p,"
+       TP_printk_btrfs("work=%p (normal_work=%p), wq=%p, func=%pf, ordered_func=%p,"
                   " ordered_free=%p",
                   __entry->work, __entry->normal_work, __entry->wq,
                    __entry->func, __entry->ordered_func, __entry->ordered_free)
@@ -1151,15 +1169,15 @@ DECLARE_EVENT_CLASS(btrfs__work__done,
  
         TP_ARGS(work),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        void *, work                    )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(btrfs_work_owner(work),
                 __entry->work           = work;
         ),
  
-       TP_printk("work->%p", __entry->work)
+       TP_printk_btrfs("work->%p", __entry->work)
  );
  
  DEFINE_EVENT(btrfs__work, btrfs_work_queued,
@@ -1196,19 +1214,19 @@ DECLARE_EVENT_CLASS(btrfs__workqueue,
  
         TP_ARGS(wq, name, high),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        void *, wq                      )
                 __string(       name,   name                    )
                 __field(        int ,   high                    )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(btrfs_workqueue_owner(wq),
                 __entry->wq             = wq;
                 __assign_str(name, name);
                 __entry->high           = high;
         ),
  
-       TP_printk("name=%s%s, wq=%p", __get_str(name),
+       TP_printk_btrfs("name=%s%s, wq=%p", __get_str(name),
                   __print_flags(__entry->high, "",
                                 {(WQ_HIGHPRI),  "-high"}),
                   __entry->wq)
@@ -1227,15 +1245,15 @@ DECLARE_EVENT_CLASS(btrfs__workqueue_done,
  
         TP_ARGS(wq),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        void *, wq                      )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(btrfs_workqueue_owner(wq),
                 __entry->wq             = wq;
         ),
  
-       TP_printk("wq=%p", __entry->wq)
+       TP_printk_btrfs("wq=%p", __entry->wq)
  );
  
  DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
@@ -1251,19 +1269,19 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_data_map,
  
         TP_ARGS(inode, free_reserved),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,            rootid          )
                 __field(        unsigned long,  ino             )
                 __field(        u64,            free_reserved   )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
                 __entry->rootid         =       BTRFS_I(inode)->root->objectid;
                 __entry->ino            =       inode->i_ino;
                 __entry->free_reserved  =       free_reserved;
         ),
  
-       TP_printk("rootid=%llu, ino=%lu, free_reserved=%llu",
+       TP_printk_btrfs("rootid=%llu, ino=%lu, free_reserved=%llu",
                   __entry->rootid, __entry->ino, __entry->free_reserved)
  );
  
@@ -1292,7 +1310,7 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
  
         TP_ARGS(inode, start, len, reserved, op),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,            rootid          )
                 __field(        unsigned long,  ino             )
                 __field(        u64,            start           )
@@ -1301,7 +1319,7 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
                 __field(        int,            op              )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
                 __entry->rootid         = BTRFS_I(inode)->root->objectid;
                 __entry->ino            = inode->i_ino;
                 __entry->start          = start;
@@ -1310,7 +1328,7 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
                 __entry->op             = op;
         ),
  
-       TP_printk("root=%llu, ino=%lu, start=%llu, len=%llu, reserved=%llu, op=%s",
+       TP_printk_btrfs("root=%llu, ino=%lu, start=%llu, len=%llu, reserved=%llu, op=%s",
                   __entry->rootid, __entry->ino, __entry->start, __entry->len,
                   __entry->reserved,
                   __print_flags((unsigned long)__entry->op, "",
@@ -1334,86 +1352,90 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
  
  DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
  
-       TP_PROTO(u64 ref_root, u64 reserved),
+       TP_PROTO(struct btrfs_fs_info *fs_info, u64 ref_root, u64 reserved),
  
-       TP_ARGS(ref_root, reserved),
+       TP_ARGS(fs_info, ref_root, reserved),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,            ref_root        )
                 __field(        u64,            reserved        )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(fs_info,
                 __entry->ref_root       = ref_root;
                 __entry->reserved       = reserved;
         ),
  
-       TP_printk("root=%llu, reserved=%llu, op=free",
+       TP_printk_btrfs("root=%llu, reserved=%llu, op=free",
                   __entry->ref_root, __entry->reserved)
  );
  
  DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
  
-       TP_PROTO(u64 ref_root, u64 reserved),
+       TP_PROTO(struct btrfs_fs_info *fs_info, u64 ref_root, u64 reserved),
  
-       TP_ARGS(ref_root, reserved)
+       TP_ARGS(fs_info, ref_root, reserved)
  );
  
  DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
-       TP_PROTO(struct btrfs_qgroup_extent_record *rec),
+       TP_PROTO(struct btrfs_fs_info *fs_info,
+                struct btrfs_qgroup_extent_record *rec),
  
-       TP_ARGS(rec),
+       TP_ARGS(fs_info, rec),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,  bytenr            )
                 __field(        u64,  num_bytes         )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(fs_info,
                 __entry->bytenr         = rec->bytenr,
                 __entry->num_bytes      = rec->num_bytes;
         ),
  
-       TP_printk("bytenr = %llu, num_bytes = %llu",
+       TP_printk_btrfs("bytenr = %llu, num_bytes = %llu",
                   (unsigned long long)__entry->bytenr,
                   (unsigned long long)__entry->num_bytes)
  );
  
  DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents,
  
-       TP_PROTO(struct btrfs_qgroup_extent_record *rec),
+       TP_PROTO(struct btrfs_fs_info *fs_info,
+                struct btrfs_qgroup_extent_record *rec),
  
-       TP_ARGS(rec)
+       TP_ARGS(fs_info, rec)
  );
  
  DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_insert_dirty_extent,
  
-       TP_PROTO(struct btrfs_qgroup_extent_record *rec),
+       TP_PROTO(struct btrfs_fs_info *fs_info,
+                struct btrfs_qgroup_extent_record *rec),
  
-       TP_ARGS(rec)
+       TP_ARGS(fs_info, rec)
  );
  
  TRACE_EVENT(btrfs_qgroup_account_extent,
  
-       TP_PROTO(u64 bytenr, u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots),
+       TP_PROTO(struct btrfs_fs_info *fs_info, u64 bytenr,
+                u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots),
  
-       TP_ARGS(bytenr, num_bytes, nr_old_roots, nr_new_roots),
+       TP_ARGS(fs_info, bytenr, num_bytes, nr_old_roots, nr_new_roots),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,  bytenr                    )
                 __field(        u64,  num_bytes                 )
                 __field(        u64,  nr_old_roots              )
                 __field(        u64,  nr_new_roots              )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(fs_info,
                 __entry->bytenr         = bytenr;
                 __entry->num_bytes      = num_bytes;
                 __entry->nr_old_roots   = nr_old_roots;
                 __entry->nr_new_roots   = nr_new_roots;
         ),
  
-       TP_printk("bytenr = %llu, num_bytes = %llu, nr_old_roots = %llu, "
+       TP_printk_btrfs("bytenr = %llu, num_bytes = %llu, nr_old_roots = %llu, "
                   "nr_new_roots = %llu",
                   __entry->bytenr,
                   __entry->num_bytes,
@@ -1423,23 +1445,24 @@ TRACE_EVENT(btrfs_qgroup_account_extent,
  
  TRACE_EVENT(qgroup_update_counters,
  
-       TP_PROTO(u64 qgid, u64 cur_old_count, u64 cur_new_count),
+       TP_PROTO(struct btrfs_fs_info *fs_info, u64 qgid,
+                u64 cur_old_count, u64 cur_new_count),
  
-       TP_ARGS(qgid, cur_old_count, cur_new_count),
+       TP_ARGS(fs_info, qgid, cur_old_count, cur_new_count),
  
-       TP_STRUCT__entry(
+       TP_STRUCT__entry_btrfs(
                 __field(        u64,  qgid                      )
                 __field(        u64,  cur_old_count             )
                 __field(        u64,  cur_new_count             )
         ),
  
-       TP_fast_assign(
+       TP_fast_assign_btrfs(fs_info,
                 __entry->qgid           = qgid;
                 __entry->cur_old_count  = cur_old_count;
                 __entry->cur_new_count  = cur_new_count;
         ),
  
-       TP_printk("qgid = %llu, cur_old_count = %llu, cur_new_count = %llu",
+       TP_printk_btrfs("qgid = %llu, cur_old_count = %llu, cur_new_count = %llu",
                   __entry->qgid,
                   __entry->cur_old_count,
                   __entry->cur_new_count)
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h

index 003dca933803901da37efb168833b67a3a1f06ce..8a707f8a41c3df76b682032cf180eda869a49e1d 100644 (file)
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -473,6 +473,39 @@ TRACE_EVENT(svc_recv,
                         show_rqstp_flags(__entry->flags))
  );
  
+DECLARE_EVENT_CLASS(svc_rqst_event,
+
+       TP_PROTO(struct svc_rqst *rqst),
+
+       TP_ARGS(rqst),
+
+       TP_STRUCT__entry(
+               __field(__be32, xid)
+               __field(unsigned long, flags)
+               __dynamic_array(unsigned char, addr, rqst->rq_addrlen)
+       ),
+
+       TP_fast_assign(
+               __entry->xid = rqst->rq_xid;
+               __entry->flags = rqst->rq_flags;
+               memcpy(__get_dynamic_array(addr),
+                       &rqst->rq_addr, rqst->rq_addrlen);
+       ),
+
+       TP_printk("addr=%pIScp rq_xid=0x%x flags=%s",
+               (struct sockaddr *)__get_dynamic_array(addr),
+               be32_to_cpu(__entry->xid),
+               show_rqstp_flags(__entry->flags))
+);
+
+DEFINE_EVENT(svc_rqst_event, svc_defer,
+       TP_PROTO(struct svc_rqst *rqst),
+       TP_ARGS(rqst));
+
+DEFINE_EVENT(svc_rqst_event, svc_drop,
+       TP_PROTO(struct svc_rqst *rqst),
+       TP_ARGS(rqst));
+
  DECLARE_EVENT_CLASS(svc_rqst_status,
  
         TP_PROTO(struct svc_rqst *rqst, int status),
@@ -529,45 +562,67 @@ TRACE_EVENT(svc_xprt_do_enqueue,
  
         TP_STRUCT__entry(
                 __field(struct svc_xprt *, xprt)
-               __field_struct(struct sockaddr_storage, ss)
                 __field(int, pid)
                 __field(unsigned long, flags)
+               __dynamic_array(unsigned char, addr, xprt != NULL ?
+                       xprt->xpt_remotelen : 0)
         ),
  
         TP_fast_assign(
                 __entry->xprt = xprt;
-               xprt ? memcpy(&__entry->ss, &xprt->xpt_remote, sizeof(__entry->ss)) : memset(&__entry->ss, 0, sizeof(__entry->ss));
                 __entry->pid = rqst? rqst->rq_task->pid : 0;
-               __entry->flags = xprt ? xprt->xpt_flags : 0;
+               if (xprt) {
+                       memcpy(__get_dynamic_array(addr),
+                               &xprt->xpt_remote,
+                               xprt->xpt_remotelen);
+                       __entry->flags = xprt->xpt_flags;
+               } else
+                       __entry->flags = 0;
         ),
  
         TP_printk("xprt=0x%p addr=%pIScp pid=%d flags=%s", __entry->xprt,
-               (struct sockaddr *)&__entry->ss,
+               __get_dynamic_array_len(addr) != 0 ?
+                       (struct sockaddr *)__get_dynamic_array(addr) : NULL,
                 __entry->pid, show_svc_xprt_flags(__entry->flags))
  );
  
-TRACE_EVENT(svc_xprt_dequeue,
+DECLARE_EVENT_CLASS(svc_xprt_event,
         TP_PROTO(struct svc_xprt *xprt),
  
         TP_ARGS(xprt),
  
         TP_STRUCT__entry(
                 __field(struct svc_xprt *, xprt)
-               __field_struct(struct sockaddr_storage, ss)
                 __field(unsigned long, flags)
+               __dynamic_array(unsigned char, addr, xprt != NULL ?
+                       xprt->xpt_remotelen : 0)
         ),
  
         TP_fast_assign(
-               __entry->xprt = xprt,
-               xprt ? memcpy(&__entry->ss, &xprt->xpt_remote, sizeof(__entry->ss)) : memset(&__entry->ss, 0, sizeof(__entry->ss));
-               __entry->flags = xprt ? xprt->xpt_flags : 0;
+               __entry->xprt = xprt;
+               if (xprt) {
+                       memcpy(__get_dynamic_array(addr),
+                                       &xprt->xpt_remote,
+                                       xprt->xpt_remotelen);
+                       __entry->flags = xprt->xpt_flags;
+               } else
+                       __entry->flags = 0;
         ),
  
         TP_printk("xprt=0x%p addr=%pIScp flags=%s", __entry->xprt,
-               (struct sockaddr *)&__entry->ss,
+               __get_dynamic_array_len(addr) != 0 ?
+                       (struct sockaddr *)__get_dynamic_array(addr) : NULL,
                 show_svc_xprt_flags(__entry->flags))
  );
  
+DEFINE_EVENT(svc_xprt_event, svc_xprt_dequeue,
+       TP_PROTO(struct svc_xprt *xprt),
+       TP_ARGS(xprt));
+
+DEFINE_EVENT(svc_xprt_event, svc_xprt_no_write_space,
+       TP_PROTO(struct svc_xprt *xprt),
+       TP_ARGS(xprt));
+
  TRACE_EVENT(svc_wake_up,
         TP_PROTO(int pid),
  
@@ -592,21 +647,56 @@ TRACE_EVENT(svc_handle_xprt,
         TP_STRUCT__entry(
                 __field(struct svc_xprt *, xprt)
                 __field(int, len)
-               __field_struct(struct sockaddr_storage, ss)
                 __field(unsigned long, flags)
+               __dynamic_array(unsigned char, addr, xprt != NULL ?
+                       xprt->xpt_remotelen : 0)
         ),
  
         TP_fast_assign(
                 __entry->xprt = xprt;
-               xprt ? memcpy(&__entry->ss, &xprt->xpt_remote, sizeof(__entry->ss)) : memset(&__entry->ss, 0, sizeof(__entry->ss));
                 __entry->len = len;
-               __entry->flags = xprt ? xprt->xpt_flags : 0;
+               if (xprt) {
+                       memcpy(__get_dynamic_array(addr),
+                                       &xprt->xpt_remote,
+                                       xprt->xpt_remotelen);
+                       __entry->flags = xprt->xpt_flags;
+               } else
+                       __entry->flags = 0;
         ),
  
         TP_printk("xprt=0x%p addr=%pIScp len=%d flags=%s", __entry->xprt,
-               (struct sockaddr *)&__entry->ss,
+               __get_dynamic_array_len(addr) != 0 ?
+                       (struct sockaddr *)__get_dynamic_array(addr) : NULL,
                 __entry->len, show_svc_xprt_flags(__entry->flags))
  );
+
+
+DECLARE_EVENT_CLASS(svc_deferred_event,
+       TP_PROTO(struct svc_deferred_req *dr),
+
+       TP_ARGS(dr),
+
+       TP_STRUCT__entry(
+               __field(__be32, xid)
+               __dynamic_array(unsigned char, addr, dr->addrlen)
+       ),
+
+       TP_fast_assign(
+               __entry->xid = *(__be32 *)(dr->args + (dr->xprt_hlen>>2));
+               memcpy(__get_dynamic_array(addr), &dr->addr, dr->addrlen);
+       ),
+
+       TP_printk("addr=%pIScp xid=0x%x",
+               (struct sockaddr *)__get_dynamic_array(addr),
+               be32_to_cpu(__entry->xid))
+);
+
+DEFINE_EVENT(svc_deferred_event, svc_drop_deferred,
+       TP_PROTO(struct svc_deferred_req *dr),
+       TP_ARGS(dr));
+DEFINE_EVENT(svc_deferred_event, svc_revisit_deferred,
+       TP_PROTO(struct svc_deferred_req *dr),
+       TP_ARGS(dr));
  #endif /* _TRACE_SUNRPC_H */
  
  #include <trace/define_trace.h>
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h

index 2bdd1e3e70076dd3476e3398452c4837b2b80f9b..ac5eacd3055b503f9b95e330600608bf5a018cec 100644 (file)
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -798,7 +798,7 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code)
  #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
                                         struct btrfs_ioctl_ino_path_args)
  #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
-                                       struct btrfs_ioctl_ino_path_args)
+                                       struct btrfs_ioctl_logical_ino_args)
  #define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
                                 struct btrfs_ioctl_received_subvol_args)
  #define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args)
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h

index 70b172ba41ce34ce846567a9e8f16aba53d6d166..b59ee077a5964a888be764fdb5b817e3e377bcc0 100644 (file)
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -382,6 +382,19 @@ typedef struct elf64_shdr {
  #define NT_PPC_VMX     0x100           /* PowerPC Altivec/VMX registers */
  #define NT_PPC_SPE     0x101           /* PowerPC SPE/EVR registers */
  #define NT_PPC_VSX     0x102           /* PowerPC VSX registers */
+#define NT_PPC_TAR     0x103           /* Target Address Register */
+#define NT_PPC_PPR     0x104           /* Program Priority Register */
+#define NT_PPC_DSCR    0x105           /* Data Stream Control Register */
+#define NT_PPC_EBB     0x106           /* Event Based Branch Registers */
+#define NT_PPC_PMU     0x107           /* Performance Monitor Registers */
+#define NT_PPC_TM_CGPR 0x108           /* TM checkpointed GPR Registers */
+#define NT_PPC_TM_CFPR 0x109           /* TM checkpointed FPR Registers */
+#define NT_PPC_TM_CVMX 0x10a           /* TM checkpointed VMX Registers */
+#define NT_PPC_TM_CVSX 0x10b           /* TM checkpointed VSX Registers */
+#define NT_PPC_TM_SPR  0x10c           /* TM Special Purpose Registers */
+#define NT_PPC_TM_CTAR 0x10d           /* TM checkpointed Target Address Register */
+#define NT_PPC_TM_CPPR 0x10e           /* TM checkpointed Program Priority Register */
+#define NT_PPC_TM_CDSCR        0x10f           /* TM checkpointed Data Stream Control Register */
  #define NT_386_TLS     0x200           /* i386 TLS slots (struct user_desc) */
  #define NT_386_IOPERM  0x201           /* x86 io permission bitmap (1=deny) */
  #define NT_X86_XSTATE  0x202           /* x86 extended state using xsave */
diff --git a/include/uapi/rdma/Kbuild b/include/uapi/rdma/Kbuild

index 231901b08f6ce750501c7734ca4567a14caebf4b..4edb0f2b4f9f73fcab7c85081cee0729e6c231a3 100644 (file)
--- a/include/uapi/rdma/Kbuild
+++ b/include/uapi/rdma/Kbuild
@@ -6,3 +6,4 @@ header-y += ib_user_verbs.h
  header-y += rdma_netlink.h
  header-y += rdma_user_cm.h
  header-y += hfi/
+header-y += rdma_user_rxe.h
diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h

index 98bebf8bef55e41b4e602646394c0b45dfd87d52..d15e7289d8356ec36bdb9d14b5d6e341de228519 100644 (file)
--- a/include/uapi/rdma/hfi/hfi1_user.h
+++ b/include/uapi/rdma/hfi/hfi1_user.h
@@ -75,7 +75,7 @@
   * may not be implemented; the user code must deal with this if it
   * cares, or it must abort after initialization reports the difference.
   */
-#define HFI1_USER_SWMINOR 1
+#define HFI1_USER_SWMINOR 2
  
  /*
   * We will encode the major/minor inside a single 32bit version number.
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h

index b6543d73d20a7fd03a0e77a2615679bad978d74a..7f035f4b53b0f8ec74e873bc32382ddf9f234382 100644 (file)
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -95,6 +95,11 @@ enum {
         IB_USER_VERBS_EX_CMD_CREATE_QP = IB_USER_VERBS_CMD_CREATE_QP,
         IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD,
         IB_USER_VERBS_EX_CMD_DESTROY_FLOW,
+       IB_USER_VERBS_EX_CMD_CREATE_WQ,
+       IB_USER_VERBS_EX_CMD_MODIFY_WQ,
+       IB_USER_VERBS_EX_CMD_DESTROY_WQ,
+       IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL,
+       IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL
  };
  
  /*
@@ -518,6 +523,14 @@ struct ib_uverbs_create_qp {
         __u64 driver_data[0];
  };
  
+enum ib_uverbs_create_qp_mask {
+       IB_UVERBS_CREATE_QP_MASK_IND_TABLE = 1UL << 0,
+};
+
+enum {
+       IB_UVERBS_CREATE_QP_SUP_COMP_MASK = IB_UVERBS_CREATE_QP_MASK_IND_TABLE,
+};
+
  struct ib_uverbs_ex_create_qp {
         __u64 user_handle;
         __u32 pd_handle;
@@ -535,6 +548,8 @@ struct ib_uverbs_ex_create_qp {
         __u8 reserved;
         __u32 comp_mask;
         __u32 create_flags;
+       __u32 rwq_ind_tbl_handle;
+       __u32  reserved1;
  };
  
  struct ib_uverbs_open_qp {
@@ -852,6 +867,24 @@ struct ib_uverbs_flow_spec_tcp_udp {
         struct ib_uverbs_flow_tcp_udp_filter mask;
  };
  
+struct ib_uverbs_flow_ipv6_filter {
+       __u8 src_ip[16];
+       __u8 dst_ip[16];
+};
+
+struct ib_uverbs_flow_spec_ipv6 {
+       union {
+               struct ib_uverbs_flow_spec_hdr hdr;
+               struct {
+                       __u32 type;
+                       __u16 size;
+                       __u16 reserved;
+               };
+       };
+       struct ib_uverbs_flow_ipv6_filter val;
+       struct ib_uverbs_flow_ipv6_filter mask;
+};
+
  struct ib_uverbs_flow_attr {
         __u32 type;
         __u16 size;
@@ -946,4 +979,66 @@ struct ib_uverbs_destroy_srq_resp {
         __u32 events_reported;
  };
  
+struct ib_uverbs_ex_create_wq  {
+       __u32 comp_mask;
+       __u32 wq_type;
+       __u64 user_handle;
+       __u32 pd_handle;
+       __u32 cq_handle;
+       __u32 max_wr;
+       __u32 max_sge;
+};
+
+struct ib_uverbs_ex_create_wq_resp {
+       __u32 comp_mask;
+       __u32 response_length;
+       __u32 wq_handle;
+       __u32 max_wr;
+       __u32 max_sge;
+       __u32 wqn;
+};
+
+struct ib_uverbs_ex_destroy_wq  {
+       __u32 comp_mask;
+       __u32 wq_handle;
+};
+
+struct ib_uverbs_ex_destroy_wq_resp {
+       __u32 comp_mask;
+       __u32 response_length;
+       __u32 events_reported;
+       __u32 reserved;
+};
+
+struct ib_uverbs_ex_modify_wq  {
+       __u32 attr_mask;
+       __u32 wq_handle;
+       __u32 wq_state;
+       __u32 curr_wq_state;
+};
+
+/* Guards against huge memory allocations; not the max expected size */
+#define IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE 0x0d
+struct ib_uverbs_ex_create_rwq_ind_table  {
+       __u32 comp_mask;
+       __u32 log_ind_tbl_size;
+       /* Following are the wq handles according to log_ind_tbl_size
+        * wq_handle1
+        * wq_handle2
+        */
+       __u32 wq_handles[0];
+};
+
+struct ib_uverbs_ex_create_rwq_ind_table_resp {
+       __u32 comp_mask;
+       __u32 response_length;
+       __u32 ind_tbl_handle;
+       __u32 ind_tbl_num;
+};
+
+struct ib_uverbs_ex_destroy_rwq_ind_table  {
+       __u32 comp_mask;
+       __u32 ind_tbl_handle;
+};
+
  #endif /* IB_USER_VERBS_H */
diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h

index 3066718eb12087ec22eb22e518a4f1756da33b21..01923d463673efdb527bb847f15683c17e323dd6 100644 (file)
--- a/include/uapi/rdma/rdma_user_cm.h
+++ b/include/uapi/rdma/rdma_user_cm.h
@@ -244,12 +244,19 @@ struct rdma_ucm_join_ip_mcast {
         __u32 id;
  };
  
+/* Multicast join flags */
+enum {
+       RDMA_MC_JOIN_FLAG_FULLMEMBER,
+       RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER,
+       RDMA_MC_JOIN_FLAG_RESERVED,
+};
+
  struct rdma_ucm_join_mcast {
         __u64 response;         /* rdma_ucma_create_id_resp */
         __u64 uid;
         __u32 id;
         __u16 addr_size;
-       __u16 reserved;
+       __u16 join_flags;
         struct sockaddr_storage addr;
  };
  
diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h

new file mode 100644 (file)

index 0000000..1de99cf
--- /dev/null
+++ b/include/uapi/rdma/rdma_user_rxe.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RDMA_USER_RXE_H
+#define RDMA_USER_RXE_H
+
+#include <linux/types.h>
+
+union rxe_gid {
+       __u8    raw[16];
+       struct {
+               __be64  subnet_prefix;
+               __be64  interface_id;
+       } global;
+};
+
+struct rxe_global_route {
+       union rxe_gid   dgid;
+       __u32           flow_label;
+       __u8            sgid_index;
+       __u8            hop_limit;
+       __u8            traffic_class;
+};
+
+struct rxe_av {
+       __u8                    port_num;
+       __u8                    network_type;
+       struct rxe_global_route grh;
+       union {
+               struct sockaddr         _sockaddr;
+               struct sockaddr_in      _sockaddr_in;
+               struct sockaddr_in6     _sockaddr_in6;
+       } sgid_addr, dgid_addr;
+};
+
+struct rxe_send_wr {
+       __u64                   wr_id;
+       __u32                   num_sge;
+       __u32                   opcode;
+       __u32                   send_flags;
+       union {
+               __be32          imm_data;
+               __u32           invalidate_rkey;
+       } ex;
+       union {
+               struct {
+                       __u64   remote_addr;
+                       __u32   rkey;
+               } rdma;
+               struct {
+                       __u64   remote_addr;
+                       __u64   compare_add;
+                       __u64   swap;
+                       __u32   rkey;
+               } atomic;
+               struct {
+                       __u32   remote_qpn;
+                       __u32   remote_qkey;
+                       __u16   pkey_index;
+               } ud;
+               struct {
+                       struct ib_mr *mr;
+                       __u32        key;
+                       int          access;
+               } reg;
+       } wr;
+};
+
+struct rxe_sge {
+       __u64   addr;
+       __u32   length;
+       __u32   lkey;
+};
+
+struct mminfo {
+       __u64                   offset;
+       __u32                   size;
+       __u32                   pad;
+};
+
+struct rxe_dma_info {
+       __u32                   length;
+       __u32                   resid;
+       __u32                   cur_sge;
+       __u32                   num_sge;
+       __u32                   sge_offset;
+       union {
+               __u8            inline_data[0];
+               struct rxe_sge  sge[0];
+       };
+};
+
+struct rxe_send_wqe {
+       struct rxe_send_wr      wr;
+       struct rxe_av           av;
+       __u32                   status;
+       __u32                   state;
+       __u64                   iova;
+       __u32                   mask;
+       __u32                   first_psn;
+       __u32                   last_psn;
+       __u32                   ack_length;
+       __u32                   ssn;
+       __u32                   has_rd_atomic;
+       struct rxe_dma_info     dma;
+};
+
+struct rxe_recv_wqe {
+       __u64                   wr_id;
+       __u32                   num_sge;
+       __u32                   padding;
+       struct rxe_dma_info     dma;
+};
+
+#endif /* RDMA_USER_RXE_H */
diff --git a/kernel/jump_label.c b/kernel/jump_label.c

index 20400055f177cd9ab83748fd740a01036747df90..93ad6c1fb9b6212e706eb3ae08f7b881192008ec 100644 (file)
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -288,6 +288,9 @@ void __init jump_label_init(void)
         BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0);
         BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1);
  
+       if (static_key_initialized)
+               return;
+
         jump_label_lock();
         jump_label_sort_entries(iter_start, iter_stop);
  
diff --git a/mm/Kconfig b/mm/Kconfig

index c0837845c17c408a123d5864c1bec354ee074723..78a23c5c302d96ad6ef1198a02deb39a9a6bc228 100644 (file)
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -187,6 +187,7 @@ config MEMORY_HOTPLUG
         bool "Allow for memory hot-add"
         depends on SPARSEMEM || X86_64_ACPI_NUMA
         depends on ARCH_ENABLE_MEMORY_HOTPLUG
+       depends on !KASAN
  
  config MEMORY_HOTPLUG_SPARSE
         def_bool y
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index ef968306fd5b8bc4cd9b06f402595d041f81a26f..b9aa1b0b38b0ecdb769cd33a8d77c0ffda2621bf 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3942,6 +3942,14 @@ same_page:
         return i ? i : -EFAULT;
  }
  
+#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
+/*
+ * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
+ * implement this.
+ */
+#define flush_hugetlb_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
+#endif
+
  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                 unsigned long address, unsigned long end, pgprot_t newprot)
  {
@@ -4002,7 +4010,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
          * once we release i_mmap_rwsem, another task can do the final put_page
          * and that page table be reused and filled with junk.
          */
-       flush_tlb_range(vma, start, end);
+       flush_hugetlb_tlb_range(vma, start, end);
         mmu_notifier_invalidate_range(mm, start, end);
         i_mmap_unlock_write(vma->vm_file->f_mapping);
         mmu_notifier_invalidate_range_end(mm, start, end);
diff --git a/mm/memblock.c b/mm/memblock.c

index ff5ff3b5f1ea774403b2231aa3053865a52ee31d..483197ef613f258838c40ca4e9869bc126e7e9ea 100644 (file)
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -482,7 +482,7 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
   * @flags:     flags of the new region
   *
   * Insert new memblock region [@base,@base+@size) into @type at @idx.
- * @type must already have extra room to accomodate the new region.
+ * @type must already have extra room to accommodate the new region.
   */
  static void __init_memblock memblock_insert_region(struct memblock_type *type,
                                                    int idx, phys_addr_t base,
@@ -544,7 +544,7 @@ repeat:
         /*
          * The following is executed twice.  Once with %false @insert and
          * then with %true.  The first counts the number of regions needed
-        * to accomodate the new area.  The second actually inserts them.
+        * to accommodate the new area.  The second actually inserts them.
          */
         base = obase;
         nr_new = 0;
@@ -994,7 +994,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
  
         if (*idx == (u64)ULLONG_MAX) {
                 idx_a = type_a->cnt - 1;
-               idx_b = type_b->cnt;
+               if (type_b != NULL)
+                       idx_b = type_b->cnt;
+               else
+                       idx_b = 0;
         }
  
         for (; idx_a >= 0; idx_a--) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 39a372a2a1d628a58eb5f02d3a27b3e0989b37f9..fb975cec351821151a422fb34171121f67459228 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5257,11 +5257,6 @@ static void __meminit setup_zone_pageset(struct zone *zone)
         zone->pageset = alloc_percpu(struct per_cpu_pageset);
         for_each_possible_cpu(cpu)
                 zone_pageset_init(zone, cpu);
-
-       if (!zone->zone_pgdat->per_cpu_nodestats) {
-               zone->zone_pgdat->per_cpu_nodestats =
-                       alloc_percpu(struct per_cpu_nodestat);
-       }
  }
  
  /*
@@ -5270,10 +5265,15 @@ static void __meminit setup_zone_pageset(struct zone *zone)
   */
  void __init setup_per_cpu_pageset(void)
  {
+       struct pglist_data *pgdat;
         struct zone *zone;
  
         for_each_populated_zone(zone)
                 setup_zone_pageset(zone);
+
+       for_each_online_pgdat(pgdat)
+               pgdat->per_cpu_nodestats =
+                       alloc_percpu(struct per_cpu_nodestat);
  }
  
  static noinline __ref
diff --git a/mm/slub.c b/mm/slub.c

index 26eb6a99540e8530493bf2a28e88c4edc17fba13..850737bdfbd82410dcd9e0e87d64ea808b0e39c7 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,7 +124,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
  #endif
  }
  
-inline void *fixup_red_left(struct kmem_cache *s, void *p)
+void *fixup_red_left(struct kmem_cache *s, void *p)
  {
         if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
                 p += s->red_left_pad;
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c

index e085f5ae1548194603de4af635388624daec6516..1d281816f2bf14e34a71932863e5579e4a5caa35 100644 (file)
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1230,8 +1230,9 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
         if (status)
                 goto out;
  
-       dprintk("RPC:       svcauth_gss: gss major status = %d\n",
-                       ud.major_status);
+       dprintk("RPC:       svcauth_gss: gss major status = %d "
+                       "minor status = %d\n",
+                       ud.major_status, ud.minor_status);
  
         switch (ud.major_status) {
         case GSS_S_CONTINUE_NEEDED:
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c

index 553bf95f700301a4821d4317e9277f784370bd22..4d8e11f94a35fb12390c291330c56331512f13d5 100644 (file)
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -362,7 +362,7 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
         cache_purge(cd);
         spin_lock(&cache_list_lock);
         write_lock(&cd->hash_lock);
-       if (cd->entries || atomic_read(&cd->inuse)) {
+       if (cd->entries) {
                 write_unlock(&cd->hash_lock);
                 spin_unlock(&cache_list_lock);
                 goto out;
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c

index 4f01f63102ee5d1ce936f54fa8feb3cedfc15607..c3f652395a80b8ded540bc60fe235ce504e239f7 100644 (file)
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -21,6 +21,10 @@
  
  #define RPCDBG_FACILITY        RPCDBG_SVCXPRT
  
+static unsigned int svc_rpc_per_connection_limit __read_mostly;
+module_param(svc_rpc_per_connection_limit, uint, 0644);
+
+
  static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
  static int svc_deferred_recv(struct svc_rqst *rqstp);
  static struct cache_deferred_req *svc_defer(struct cache_req *req);
@@ -329,12 +333,45 @@ char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
  }
  EXPORT_SYMBOL_GPL(svc_print_addr);
  
+static bool svc_xprt_slots_in_range(struct svc_xprt *xprt)
+{
+       unsigned int limit = svc_rpc_per_connection_limit;
+       int nrqsts = atomic_read(&xprt->xpt_nr_rqsts);
+
+       return limit == 0 || (nrqsts >= 0 && nrqsts < limit);
+}
+
+static bool svc_xprt_reserve_slot(struct svc_rqst *rqstp, struct svc_xprt *xprt)
+{
+       if (!test_bit(RQ_DATA, &rqstp->rq_flags)) {
+               if (!svc_xprt_slots_in_range(xprt))
+                       return false;
+               atomic_inc(&xprt->xpt_nr_rqsts);
+               set_bit(RQ_DATA, &rqstp->rq_flags);
+       }
+       return true;
+}
+
+static void svc_xprt_release_slot(struct svc_rqst *rqstp)
+{
+       struct svc_xprt *xprt = rqstp->rq_xprt;
+       if (test_and_clear_bit(RQ_DATA, &rqstp->rq_flags)) {
+               atomic_dec(&xprt->xpt_nr_rqsts);
+               svc_xprt_enqueue(xprt);
+       }
+}
+
  static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
  {
         if (xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_CLOSE)))
                 return true;
-       if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED)))
-               return xprt->xpt_ops->xpo_has_wspace(xprt);
+       if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED))) {
+               if (xprt->xpt_ops->xpo_has_wspace(xprt) &&
+                   svc_xprt_slots_in_range(xprt))
+                       return true;
+               trace_svc_xprt_no_write_space(xprt);
+               return false;
+       }
         return false;
  }
  
@@ -480,8 +517,6 @@ void svc_reserve(struct svc_rqst *rqstp, int space)
                 atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
                 rqstp->rq_reserved = space;
  
-               if (xprt->xpt_ops->xpo_adjust_wspace)
-                       xprt->xpt_ops->xpo_adjust_wspace(xprt);
                 svc_xprt_enqueue(xprt);
         }
  }
@@ -512,8 +547,8 @@ static void svc_xprt_release(struct svc_rqst *rqstp)
  
         rqstp->rq_res.head[0].iov_len = 0;
         svc_reserve(rqstp, 0);
+       svc_xprt_release_slot(rqstp);
         rqstp->rq_xprt = NULL;
-
         svc_xprt_put(xprt);
  }
  
@@ -781,7 +816,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
                         svc_add_new_temp_xprt(serv, newxpt);
                 else
                         module_put(xprt->xpt_class->xcl_owner);
-       } else {
+       } else if (svc_xprt_reserve_slot(rqstp, xprt)) {
                 /* XPT_DATA|XPT_DEFERRED case: */
                 dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
                         rqstp, rqstp->rq_pool->sp_id, xprt,
@@ -871,6 +906,7 @@ EXPORT_SYMBOL_GPL(svc_recv);
   */
  void svc_drop(struct svc_rqst *rqstp)
  {
+       trace_svc_drop(rqstp);
         dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
         svc_xprt_release(rqstp);
  }
@@ -1148,6 +1184,7 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
                 spin_unlock(&xprt->xpt_lock);
                 dprintk("revisit canceled\n");
                 svc_xprt_put(xprt);
+               trace_svc_drop_deferred(dr);
                 kfree(dr);
                 return;
         }
@@ -1205,6 +1242,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req)
         set_bit(RQ_DROPME, &rqstp->rq_flags);
  
         dr->handle.revisit = svc_revisit;
+       trace_svc_defer(rqstp);
         return &dr->handle;
  }
  
@@ -1245,6 +1283,7 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
                                 struct svc_deferred_req,
                                 handle.recent);
                 list_del_init(&dr->handle.recent);
+               trace_svc_revisit_deferred(dr);
         } else
                 clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
         spin_unlock(&xprt->xpt_lock);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c

index dadfec66dbd8abd301fa3ac307431dabe7999ec6..57625f64efd56edaa65fe0940aaa7979fe65ef87 100644 (file)
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -60,7 +60,6 @@
  
  static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
                                          int flags);
-static void            svc_udp_data_ready(struct sock *);
  static int             svc_udp_recvfrom(struct svc_rqst *);
  static int             svc_udp_sendto(struct svc_rqst *);
  static void            svc_sock_detach(struct svc_xprt *);
@@ -398,48 +397,21 @@ static int svc_sock_secure_port(struct svc_rqst *rqstp)
         return svc_port_is_privileged(svc_addr(rqstp));
  }
  
-static bool sunrpc_waitqueue_active(wait_queue_head_t *wq)
-{
-       if (!wq)
-               return false;
-       /*
-        * There should normally be a memory * barrier here--see
-        * wq_has_sleeper().
-        *
-        * It appears that isn't currently necessary, though, basically
-        * because callers all appear to have sufficient memory barriers
-        * between the time the relevant change is made and the
-        * time they call these callbacks.
-        *
-        * The nfsd code itself doesn't actually explicitly wait on
-        * these waitqueues, but it may wait on them for example in
-        * sendpage() or sendmsg() calls.  (And those may be the only
-        * places, since it it uses nonblocking reads.)
-        *
-        * Maybe we should add the memory barriers anyway, but these are
-        * hot paths so we'd need to be convinced there's no sigificant
-        * penalty.
-        */
-       return waitqueue_active(wq);
-}
-
  /*
   * INET callback when data has been received on the socket.
   */
-static void svc_udp_data_ready(struct sock *sk)
+static void svc_data_ready(struct sock *sk)
  {
         struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
-       wait_queue_head_t *wq = sk_sleep(sk);
  
         if (svsk) {
                 dprintk("svc: socket %p(inet %p), busy=%d\n",
                         svsk, sk,
                         test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
-               set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
-               svc_xprt_enqueue(&svsk->sk_xprt);
+               svsk->sk_odata(sk);
+               if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags))
+                       svc_xprt_enqueue(&svsk->sk_xprt);
         }
-       if (sunrpc_waitqueue_active(wq))
-               wake_up_interruptible(wq);
  }
  
  /*
@@ -448,56 +420,22 @@ static void svc_udp_data_ready(struct sock *sk)
  static void svc_write_space(struct sock *sk)
  {
         struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
-       wait_queue_head_t *wq = sk_sleep(sk);
  
         if (svsk) {
                 dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
                         svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
+               svsk->sk_owspace(sk);
                 svc_xprt_enqueue(&svsk->sk_xprt);
         }
-
-       if (sunrpc_waitqueue_active(wq)) {
-               dprintk("RPC svc_write_space: someone sleeping on %p\n",
-                      svsk);
-               wake_up_interruptible(wq);
-       }
  }
  
  static int svc_tcp_has_wspace(struct svc_xprt *xprt)
  {
-       struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-       struct svc_serv *serv = svsk->sk_xprt.xpt_server;
-       int required;
+       struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
  
         if (test_bit(XPT_LISTENER, &xprt->xpt_flags))
                 return 1;
-       required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg;
-       if (sk_stream_wspace(svsk->sk_sk) >= required ||
-           (sk_stream_min_wspace(svsk->sk_sk) == 0 &&
-            atomic_read(&xprt->xpt_reserved) == 0))
-               return 1;
-       set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
-       return 0;
-}
-
-static void svc_tcp_write_space(struct sock *sk)
-{
-       struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
-       struct socket *sock = sk->sk_socket;
-
-       if (!sk_stream_is_writeable(sk) || !sock)
-               return;
-       if (!svsk || svc_tcp_has_wspace(&svsk->sk_xprt))
-               clear_bit(SOCK_NOSPACE, &sock->flags);
-       svc_write_space(sk);
-}
-
-static void svc_tcp_adjust_wspace(struct svc_xprt *xprt)
-{
-       struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-
-       if (svc_tcp_has_wspace(xprt))
-               clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+       return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
  }
  
  /*
@@ -746,7 +684,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
         svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class,
                       &svsk->sk_xprt, serv);
         clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
-       svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
+       svsk->sk_sk->sk_data_ready = svc_data_ready;
         svsk->sk_sk->sk_write_space = svc_write_space;
  
         /* initialise setting must have enough space to
@@ -786,11 +724,12 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
  static void svc_tcp_listen_data_ready(struct sock *sk)
  {
         struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
-       wait_queue_head_t *wq;
  
         dprintk("svc: socket %p TCP (listen) state change %d\n",
                 sk, sk->sk_state);
  
+       if (svsk)
+               svsk->sk_odata(sk);
         /*
          * This callback may called twice when a new connection
          * is established as a child socket inherits everything
@@ -808,10 +747,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
                 } else
                         printk("svc: socket %p: no user data\n", sk);
         }
-
-       wq = sk_sleep(sk);
-       if (sunrpc_waitqueue_active(wq))
-               wake_up_interruptible_all(wq);
  }
  
  /*
@@ -820,7 +755,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
  static void svc_tcp_state_change(struct sock *sk)
  {
         struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
-       wait_queue_head_t *wq = sk_sleep(sk);
  
         dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
                 sk, sk->sk_state, sk->sk_user_data);
@@ -828,26 +762,12 @@ static void svc_tcp_state_change(struct sock *sk)
         if (!svsk)
                 printk("svc: socket %p: no user data\n", sk);
         else {
-               set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
-               svc_xprt_enqueue(&svsk->sk_xprt);
-       }
-       if (sunrpc_waitqueue_active(wq))
-               wake_up_interruptible_all(wq);
-}
-
-static void svc_tcp_data_ready(struct sock *sk)
-{
-       struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
-       wait_queue_head_t *wq = sk_sleep(sk);
-
-       dprintk("svc: socket %p TCP data ready (svsk %p)\n",
-               sk, sk->sk_user_data);
-       if (svsk) {
-               set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
-               svc_xprt_enqueue(&svsk->sk_xprt);
+               svsk->sk_ostate(sk);
+               if (sk->sk_state != TCP_ESTABLISHED) {
+                       set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+                       svc_xprt_enqueue(&svsk->sk_xprt);
+               }
         }
-       if (sunrpc_waitqueue_active(wq))
-               wake_up_interruptible(wq);
  }
  
  /*
@@ -901,6 +821,11 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
         dprintk("%s: connect from %s\n", serv->sv_name,
                 __svc_print_addr(sin, buf, sizeof(buf)));
  
+       /* Reset the inherited callbacks before calling svc_setup_socket */
+       newsock->sk->sk_state_change = svsk->sk_ostate;
+       newsock->sk->sk_data_ready = svsk->sk_odata;
+       newsock->sk->sk_write_space = svsk->sk_owspace;
+
         /* make sure that a write doesn't block forever when
          * low on memory
          */
@@ -1317,7 +1242,6 @@ static struct svc_xprt_ops svc_tcp_ops = {
         .xpo_has_wspace = svc_tcp_has_wspace,
         .xpo_accept = svc_tcp_accept,
         .xpo_secure_port = svc_sock_secure_port,
-       .xpo_adjust_wspace = svc_tcp_adjust_wspace,
  };
  
  static struct svc_xprt_class svc_tcp_class = {
@@ -1357,8 +1281,8 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
         } else {
                 dprintk("setting up TCP socket for reading\n");
                 sk->sk_state_change = svc_tcp_state_change;
-               sk->sk_data_ready = svc_tcp_data_ready;
-               sk->sk_write_space = svc_tcp_write_space;
+               sk->sk_data_ready = svc_data_ready;
+               sk->sk_write_space = svc_write_space;
  
                 svsk->sk_reclen = 0;
                 svsk->sk_tcplen = 0;
@@ -1368,8 +1292,13 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
                 tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
  
                 set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
-               if (sk->sk_state != TCP_ESTABLISHED)
+               switch (sk->sk_state) {
+               case TCP_SYN_RECV:
+               case TCP_ESTABLISHED:
+                       break;
+               default:
                         set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+               }
         }
  }
  
@@ -1428,17 +1357,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
         /* Initialize the socket */
         if (sock->type == SOCK_DGRAM)
                 svc_udp_init(svsk, serv);
-       else {
-               /* initialise setting must have enough space to
-                * receive and respond to one request.
-                */
-               svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg,
-                                       4 * serv->sv_max_mesg);
+       else
                 svc_tcp_init(svsk, serv);
-       }
  
-       dprintk("svc: svc_setup_socket created %p (inet %p)\n",
-                               svsk, svsk->sk_sk);
+       dprintk("svc: svc_setup_socket created %p (inet %p), "
+                       "listen %d close %d\n",
+                       svsk, svsk->sk_sk,
+                       test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags),
+                       test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
  
         return svsk;
  }
@@ -1606,18 +1532,16 @@ static void svc_sock_detach(struct svc_xprt *xprt)
  {
         struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
         struct sock *sk = svsk->sk_sk;
-       wait_queue_head_t *wq;
  
         dprintk("svc: svc_sock_detach(%p)\n", svsk);
  
         /* put back the old socket callbacks */
+       lock_sock(sk);
         sk->sk_state_change = svsk->sk_ostate;
         sk->sk_data_ready = svsk->sk_odata;
         sk->sk_write_space = svsk->sk_owspace;
-
-       wq = sk_sleep(sk);
-       if (sunrpc_waitqueue_active(wq))
-               wake_up_interruptible(wq);
+       sk->sk_user_data = NULL;
+       release_sock(sk);
  }
  
  /*
diff --git a/sound/arm/Kconfig b/sound/arm/Kconfig

index e0406211716b003daae37efbc8cdfd73213b31f3..65171f6657a2984ccc14ce907a4c7c87d77d196b 100644 (file)
--- a/sound/arm/Kconfig
+++ b/sound/arm/Kconfig
@@ -9,14 +9,6 @@ menuconfig SND_ARM
           Drivers that are implemented on ASoC can be found in
           "ALSA for SoC audio support" section.
  
-config SND_PXA2XX_LIB
-       tristate
-       select SND_AC97_CODEC if SND_PXA2XX_LIB_AC97
-       select SND_DMAENGINE_PCM
-
-config SND_PXA2XX_LIB_AC97
-       bool
-
  if SND_ARM
  
  config SND_ARMAACI
@@ -42,3 +34,10 @@ config SND_PXA2XX_AC97
  
  endif  # SND_ARM
  
+config SND_PXA2XX_LIB
+       tristate
+       select SND_AC97_CODEC if SND_PXA2XX_LIB_AC97
+       select SND_DMAENGINE_PCM
+
+config SND_PXA2XX_LIB_AC97
+       bool
diff --git a/sound/hda/array.c b/sound/hda/array.c

index 516795baa7db60ad37aec14d4886b4ce5da4f78d..5dfa610e4471887c1c1570a61cd921c6493372de 100644 (file)
--- a/sound/hda/array.c
+++ b/sound/hda/array.c
@@ -21,13 +21,15 @@ void *snd_array_new(struct snd_array *array)
                 return NULL;
         if (array->used >= array->alloced) {
                 int num = array->alloced + array->alloc_align;
+               int oldsize = array->alloced * array->elem_size;
                 int size = (num + 1) * array->elem_size;
                 void *nlist;
                 if (snd_BUG_ON(num >= 4096))
                         return NULL;
-               nlist = krealloc(array->list, size, GFP_KERNEL | __GFP_ZERO);
+               nlist = krealloc(array->list, size, GFP_KERNEL);
                 if (!nlist)
                         return NULL;
+               memset(nlist + oldsize, 0, size - oldsize);
                 array->list = nlist;
                 array->alloced = num;
         }
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c

index 6f8ea13323c1819c27dbe10f2af0293415f06135..89dacf9b4e6cbcdd7caaed257ce77a510083c3c4 100644 (file)
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -2265,6 +2265,8 @@ static const struct pci_device_id azx_ids[] = {
         { PCI_DEVICE(0x1022, 0x780d),
           .driver_data = AZX_DRIVER_GENERIC | AZX_DCAPS_PRESET_ATI_SB },
         /* ATI HDMI */
+       { PCI_DEVICE(0x1002, 0x0002),
+         .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS },
         { PCI_DEVICE(0x1002, 0x1308),
           .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS },
         { PCI_DEVICE(0x1002, 0x157a),
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c

index ddd29b9819baa08157bea9e5982da665c6a9042d..574b1b48996f1d9199bd1349ca08b0f108199762 100644 (file)
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -4689,6 +4689,22 @@ static void alc290_fixup_mono_speakers(struct hda_codec *codec,
         }
  }
  
+static void alc298_fixup_speaker_volume(struct hda_codec *codec,
+                                       const struct hda_fixup *fix, int action)
+{
+       if (action == HDA_FIXUP_ACT_PRE_PROBE) {
+               /* The speaker is routed to Node 0x06 by mistake; as a result we
+                  can't adjust the speaker's volume, since that node does not
+                  have Amp-out capability. Change the speaker's route to:
+                  Node 0x02 (Audio Output) -> Node 0x0c (Audio Mixer) ->
+                  Node 0x17 (Pin Complex). Since Node 0x02 has Amp-out caps,
+                  the speaker's volume can then be adjusted. */
+
+               hda_nid_t conn1[1] = { 0x0c };
+               snd_hda_override_conn_list(codec, 0x17, 1, conn1);
+       }
+}
+
  /* Hook to update amp GPIO4 for automute */
  static void alc280_hp_gpio4_automute_hook(struct hda_codec *codec,
                                           struct hda_jack_callback *jack)
@@ -4838,6 +4854,7 @@ enum {
         ALC280_FIXUP_HP_HEADSET_MIC,
         ALC221_FIXUP_HP_FRONT_MIC,
         ALC292_FIXUP_TPT460,
+       ALC298_FIXUP_SPK_VOLUME,
  };
  
  static const struct hda_fixup alc269_fixups[] = {
@@ -5493,6 +5510,12 @@ static const struct hda_fixup alc269_fixups[] = {
                 .chained = true,
                 .chain_id = ALC293_FIXUP_LENOVO_SPK_NOISE,
         },
+       [ALC298_FIXUP_SPK_VOLUME] = {
+               .type = HDA_FIXUP_FUNC,
+               .v.func = alc298_fixup_speaker_volume,
+               .chained = true,
+               .chain_id = ALC298_FIXUP_DELL1_MIC_NO_PRESENCE,
+       },
  };
  
  static const struct snd_pci_quirk alc269_fixup_tbl[] = {
@@ -5539,6 +5562,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
         SND_PCI_QUIRK(0x1028, 0x0704, "Dell XPS 13 9350", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE),
         SND_PCI_QUIRK(0x1028, 0x0725, "Dell Inspiron 3162", ALC255_FIXUP_DELL_SPK_NOISE),
         SND_PCI_QUIRK(0x1028, 0x075b, "Dell XPS 13 9360", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE),
+       SND_PCI_QUIRK(0x1028, 0x075d, "Dell AIO", ALC298_FIXUP_SPK_VOLUME),
         SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
         SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
         SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2),
@@ -5813,6 +5837,10 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
                 {0x14, 0x90170130},
                 {0x1b, 0x01014020},
                 {0x21, 0x0221103f}),
+       SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
+               {0x14, 0x90170130},
+               {0x1b, 0x02011020},
+               {0x21, 0x0221103f}),
         SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
                 {0x14, 0x90170150},
                 {0x1b, 0x02011020},
@@ -6549,6 +6577,7 @@ enum {
         ALC668_FIXUP_ASUS_Nx51,
         ALC891_FIXUP_HEADSET_MODE,
         ALC891_FIXUP_DELL_MIC_NO_PRESENCE,
+       ALC662_FIXUP_ACER_VERITON,
  };
  
  static const struct hda_fixup alc662_fixups[] = {
@@ -6818,6 +6847,13 @@ static const struct hda_fixup alc662_fixups[] = {
                 .chained = true,
                 .chain_id = ALC891_FIXUP_HEADSET_MODE
         },
+       [ALC662_FIXUP_ACER_VERITON] = {
+               .type = HDA_FIXUP_PINS,
+               .v.pins = (const struct hda_pintbl[]) {
+                       { 0x15, 0x50170120 }, /* no internal speaker */
+                       { }
+               }
+       },
  };
  
  static const struct snd_pci_quirk alc662_fixup_tbl[] = {
@@ -6856,6 +6892,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = {
         SND_PCI_QUIRK(0x17aa, 0x38af, "Lenovo Ideapad Y550P", ALC662_FIXUP_IDEAPAD),
         SND_PCI_QUIRK(0x17aa, 0x3a0d, "Lenovo Ideapad Y550", ALC662_FIXUP_IDEAPAD),
         SND_PCI_QUIRK(0x19da, 0xa130, "Zotac Z68", ALC662_FIXUP_ZOTAC_Z68),
+       SND_PCI_QUIRK(0x1b0a, 0x01b8, "ACER Veriton", ALC662_FIXUP_ACER_VERITON),
         SND_PCI_QUIRK(0x1b35, 0x2206, "CZC P10T", ALC662_FIXUP_CZC_P10T),
  
  #if 0
diff --git a/tools/testing/selftests/timers/rtctest.c b/tools/testing/selftests/timers/rtctest.c

index 624bce51b27d26fad9a88528a71439125a20d183..4230d3052e5d5948939926866d649ad636726051 100644 (file)
--- a/tools/testing/selftests/timers/rtctest.c
+++ b/tools/testing/selftests/timers/rtctest.c
@@ -144,11 +144,12 @@ test_READ:
  
         retval = ioctl(fd, RTC_ALM_SET, &rtc_tm);
         if (retval == -1) {
-               if (errno == ENOTTY) {
+               if (errno == EINVAL) {
                         fprintf(stderr,
                                 "\n...Alarm IRQs not supported.\n");
                         goto test_PIE;
                 }
+
                 perror("RTC_ALM_SET ioctl");
                 exit(errno);
         }
@@ -166,6 +167,12 @@ test_READ:
         /* Enable alarm interrupts */
         retval = ioctl(fd, RTC_AIE_ON, 0);
         if (retval == -1) {
+               if (errno == EINVAL) {
+                       fprintf(stderr,
+                               "\n...Alarm IRQs not supported.\n");
+                       goto test_PIE;
+               }
+
                 perror("RTC_AIE_ON ioctl");
                 exit(errno);
         }
@@ -193,7 +200,7 @@ test_PIE:
         retval = ioctl(fd, RTC_IRQP_READ, &tmp);
         if (retval == -1) {
                 /* not all RTCs support periodic IRQs */
-               if (errno == ENOTTY) {
+               if (errno == EINVAL) {
                         fprintf(stderr, "\nNo periodic IRQ support\n");
                         goto done;
                 }
@@ -211,7 +218,7 @@ test_PIE:
                 retval = ioctl(fd, RTC_IRQP_SET, tmp);
                 if (retval == -1) {
                         /* not all RTCs can change their periodic IRQ rate */
-                       if (errno == ENOTTY) {
+                       if (errno == EINVAL) {
                                 fprintf(stderr,
                                         "\n...Periodic IRQ rate is fixed\n");
                                 goto done;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 6 Aug 2016 03:31:51 +0000 (23:31 -0400)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 6 Aug 2016 03:31:51 +0000 (23:31 -0400)