Add arm64 hardware assisted crc32c support
authorwei xiao <wei.xiao@arm.com>
Thu, 5 Jan 2017 02:32:40 +0000 (10:32 +0800)
committerwei xiao <wei.xiao@arm.com>
Thu, 5 Jan 2017 02:32:40 +0000 (10:32 +0800)
HW assisted crc32c get ~ x39.6 speedups on ARM Cortex-A57@2GHz

Auto-detect whether to use hw assisted crc32c.
If the hardware assisted crypto is available, always use it.
Otherwise, fallback to software.

HOWTO
Makefile
arch/arch-aarch64.h
configure
crc/crc32c-arm64.c [new file with mode: 0644]
crc/crc32c.h
crc/test.c
lib/bloom.c
options.c
verify.c
verify.h

diff --git a/HOWTO b/HOWTO
index 4354e465c5fd2aec62c5502b43d50b0d66629cd6..4cc733f60e01c728ecbc3d64b7b32114d060d8fd 100644 (file)
--- a/HOWTO
+++ b/HOWTO
@@ -1514,6 +1514,11 @@ verify=str       If writing to a file, fio can verify the file contents
                                back to regular software crc32c, if not
                                supported by the system.
 
+                       crc32c-arm64 Use hardware assisted crc32c calculation
+                               provided on CRC enabled ARM 64-bits processors.
+                               Falls back to regular software crc32c, if not
+                               supported by the system.
+
                        crc32   Use a crc32 sum of the data area and store
                                it in the header of each block.
 
index 4c641689d6bf98f109493ac767605e918228514a..ad02d935da8d166a93e40b0bcbe1445b1909544b 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -234,10 +234,10 @@ endif
 T_DEDUPE_OBJS = t/dedupe.o
 T_DEDUPE_OBJS += lib/rbtree.o t/log.o mutex.o smalloc.o gettime.o crc/md5.o \
                lib/memalign.o lib/bloom.o t/debug.o crc/xxhash.o t/arch.o \
-               crc/murmur3.o crc/crc32c.o crc/crc32c-intel.o crc/fnv.o
+               crc/murmur3.o crc/crc32c.o crc/crc32c-intel.o crc/crc32c-arm64.o crc/fnv.o
 T_DEDUPE_PROGS = t/fio-dedupe
 
-T_VS_OBJS = t/verify-state.o t/log.o crc/crc32c.o crc/crc32c-intel.o t/debug.o
+T_VS_OBJS = t/verify-state.o t/log.o crc/crc32c.o crc/crc32c-intel.o crc/crc32c-arm64.o t/debug.o
 T_VS_PROGS = t/fio-verify-state
 
 T_PIPE_ASYNC_OBJS = t/read-to-pipe-async.o
index 2a86cc5ab4d9b9cb1bef2dcf5d0aab24bfc38fb8..0912a86f63c4598ee9d673fe3a80da9f2d9d5065 100644 (file)
@@ -27,4 +27,8 @@ static inline int arch_ffz(unsigned long bitmask)
 
 #define ARCH_HAVE_FFZ
 
+#ifdef ARCH_HAVE_CRC_CRYPTO
+#define ARCH_HAVE_ARM64_CRC_CRYPTO
+#endif
+
 #endif
index fc1578221a7b69314983eea9f28f4fca499afc92..7de88f88b75b7cc5bb32f6d35919ffc99e785fc1 100755 (executable)
--- a/configure
+++ b/configure
@@ -342,6 +342,8 @@ elif check_define __s390__ ; then
   fi
 elif check_define __arm__ ; then
   cpu="arm"
+elif check_define __aarch64__ ; then
+  cpu="aarch64"
 elif check_define __hppa__ ; then
   cpu="hppa"
 else
@@ -362,6 +364,9 @@ case "$cpu" in
   armv*b|armv*l|arm)
     cpu="arm"
   ;;
+  aarch64)
+    cpu="arm64"
+  ;;
   hppa|parisc|parisc64)
     cpu="hppa"
   ;;
@@ -1780,6 +1785,24 @@ if compile_prog "" "" "bool"; then
 fi
 echo "bool                          $have_bool"
 
+##########################################
+# check march=armv8-a+crc+crypto
+march_armv8_a_crc_crypto="no"
+if test "$cpu" = "arm64" ; then
+  cat > $TMPC <<EOF
+int main(void)
+{
+  return 0;
+}
+EOF
+  if compile_prog "-march=armv8-a+crc+crypto" "" ""; then
+    march_armv8_a_crc_crypto="yes"
+    CFLAGS="$CFLAGS -march=armv8-a+crc+crypto -DARCH_HAVE_CRC_CRYPTO"
+  fi
+fi
+echo "march_armv8_a_crc_crypto      $march_armv8_a_crc_crypto"
+
+
 #############################################################################
 
 if test "$wordsize" = "64" ; then
diff --git a/crc/crc32c-arm64.c b/crc/crc32c-arm64.c
new file mode 100644 (file)
index 0000000..78fa64e
--- /dev/null
@@ -0,0 +1,115 @@
+#include "crc32c.h"
+
+#define CRC32C3X8(ITR) \
+       crc1 = __crc32cd(crc1, *((const uint64_t *)data + 42*1 + (ITR)));\
+       crc2 = __crc32cd(crc2, *((const uint64_t *)data + 42*2 + (ITR)));\
+       crc0 = __crc32cd(crc0, *((const uint64_t *)data + 42*0 + (ITR)));
+
+#define CRC32C7X3X8(ITR) do {\
+       CRC32C3X8((ITR)*7+0) \
+       CRC32C3X8((ITR)*7+1) \
+       CRC32C3X8((ITR)*7+2) \
+       CRC32C3X8((ITR)*7+3) \
+       CRC32C3X8((ITR)*7+4) \
+       CRC32C3X8((ITR)*7+5) \
+       CRC32C3X8((ITR)*7+6) \
+       } while(0)
+
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32             (1 << 7)
+#endif /* HWCAP_CRC32 */
+
+int crc32c_arm64_available = 0;
+
+#ifdef ARCH_HAVE_ARM64_CRC_CRYPTO
+
+#include <sys/auxv.h>
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+static int crc32c_probed;
+
+/*
+ * Function to calculate reflected crc with PMULL Instruction
+ * crc done "by 3" for fixed input block size of 1024 bytes
+ */
+uint32_t crc32c_arm64(unsigned char const *data, unsigned long length)
+{
+       signed long len = length;
+       uint32_t crc = ~0;
+       uint32_t crc0, crc1, crc2;
+
+       /* Load two consts: K1 and K2 */
+       const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
+       uint64_t t0, t1;
+
+       while ((len -= 1024) >= 0) {
+               /* Do first 8 bytes here for better pipelining */
+               crc0 = __crc32cd(crc, *(const uint64_t *)data);
+               crc1 = 0;
+               crc2 = 0;
+               data += sizeof(uint64_t);
+
+               /* Process block inline
+                  Process crc0 last to avoid dependency with above */
+               CRC32C7X3X8(0);
+               CRC32C7X3X8(1);
+               CRC32C7X3X8(2);
+               CRC32C7X3X8(3);
+               CRC32C7X3X8(4);
+               CRC32C7X3X8(5);
+
+               data += 42*3*sizeof(uint64_t);
+
+               /* Merge crc0 and crc1 into crc2
+                  crc1 multiply by K2
+                  crc0 multiply by K1 */
+
+               t1 = (uint64_t)vmull_p64(crc1, k2);
+               t0 = (uint64_t)vmull_p64(crc0, k1);
+               crc = __crc32cd(crc2, *(const uint64_t *)data);
+               crc1 = __crc32cd(0, t1);
+               crc ^= crc1;
+               crc0 = __crc32cd(0, t0);
+               crc ^= crc0;
+
+               data += sizeof(uint64_t);
+       }
+
+       if(!(len += 1024))
+               return crc;
+
+       while ((len -= sizeof(uint64_t)) >= 0) {
+                crc = __crc32cd(crc, *(const uint64_t *)data);
+                data += sizeof(uint64_t);
+        }
+
+        /* The following is more efficient than the straight loop */
+        if (len & sizeof(uint32_t)) {
+                crc = __crc32cw(crc, *(const uint32_t *)data);
+                data += sizeof(uint32_t);
+        }
+        if (len & sizeof(uint16_t)) {
+                crc = __crc32ch(crc, *(const uint16_t *)data);
+                data += sizeof(uint16_t);
+        }
+        if (len & sizeof(uint8_t)) {
+                crc = __crc32cb(crc, *(const uint8_t *)data);
+        }
+
+       return crc;
+}
+
+void crc32c_arm64_probe(void)
+{
+       unsigned long hwcap;
+       if (!crc32c_probed) {
+               hwcap = getauxval(AT_HWCAP);
+               if (hwcap & HWCAP_CRC32) {
+                       crc32c_arm64_available = 1;
+               }
+               crc32c_probed = 1;
+       }
+}
+
+#endif /* ARCH_HAVE_ARM64_CRC_CRYPTO */
index 11bcf9c8f4e8577efccf7eac467043bd572e617f..50d349bdddf33d0525654f07afe12aeb95c03695 100644 (file)
 #include "../arch/arch.h"
 
 extern uint32_t crc32c_sw(unsigned char const *, unsigned long);
+extern int crc32c_arm64_available;
 extern int crc32c_intel_available;
 
+#ifdef ARCH_HAVE_ARM64_CRC_CRYPTO
+extern uint32_t crc32c_arm64(unsigned char const *, unsigned long);
+extern void crc32c_arm64_probe(void);
+#else
+#define crc32c_arm64 crc32c_sw
+static inline void crc32c_arm64_probe(void)
+{
+}
+#endif
+
 #ifdef ARCH_HAVE_SSE4_2
 extern uint32_t crc32c_intel(unsigned char const *, unsigned long);
 extern void crc32c_intel_probe(void);
@@ -35,6 +46,10 @@ static inline void crc32c_intel_probe(void)
 
 static inline uint32_t fio_crc32c(unsigned char const *buf, unsigned long len)
 {
+       if (crc32c_arm64_available) {
+               return crc32c_arm64(buf, len);
+       }
+
        if (crc32c_intel_available)
                return crc32c_intel(buf, len);
 
index 300000d2c153b5417d5a2f3788fa9722ea42a69e..78f19ac5867d561b1eb9a9aa4a6cd9ff5aa7a18d 100644 (file)
@@ -291,6 +291,7 @@ int fio_crctest(const char *type)
        int i, first = 1;
        void *buf;
 
+       crc32c_arm64_probe();
        crc32c_intel_probe();
 
        if (!type)
index fa38db9551469f2eaaef29b5bce51bee2e71e487..7a9ebaa9360ce4285a15d2129fa859e45b9c4ee5 100644 (file)
@@ -65,6 +65,7 @@ struct bloom *bloom_new(uint64_t entries)
        struct bloom *b;
        size_t no_uints;
 
+       crc32c_arm64_probe();
        crc32c_intel_probe();
 
        b = malloc(sizeof(*b));
index 1ca16e84ad15a5c916bf832f03d7c6f4a9c94a62..5886c505b58f2466de4356a123a403ea6dcd01b2 100644 (file)
--- a/options.c
+++ b/options.c
@@ -2647,6 +2647,10 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                            .oval = VERIFY_CRC32C,
                            .help = "Use crc32c checksums for verification (hw assisted, if available)",
                          },
+                         { .ival = "crc32c-arm64",
+                           .oval = VERIFY_CRC32C,
+                           .help = "Use crc32c checksums for verification (hw assisted, if available)",
+                         },
                          { .ival = "crc32c",
                            .oval = VERIFY_CRC32C,
                            .help = "Use crc32c checksums for verification (hw assisted, if available)",
index 790ab31d3cddd68c0aa4379c9d811d048956efa1..8733febc7a545b3a51c9ee722fb9b872e77e736d 100644 (file)
--- a/verify.c
+++ b/verify.c
@@ -1210,7 +1210,9 @@ nothing:
 void fio_verify_init(struct thread_data *td)
 {
        if (td->o.verify == VERIFY_CRC32C_INTEL ||
+           td->o.verify == VERIFY_CRC32C_ARM64 ||
            td->o.verify == VERIFY_CRC32C) {
+               crc32c_arm64_probe();
                crc32c_intel_probe();
        }
 }
index deb161e2a99db3d96fb97395ae4041e4a389ed57..8d40ff66de0806d26f425e9c7eae001937285503 100644 (file)
--- a/verify.h
+++ b/verify.h
@@ -15,6 +15,7 @@ enum {
        VERIFY_CRC64,                   /* crc64 sum data blocks */
        VERIFY_CRC32,                   /* crc32 sum data blocks */
        VERIFY_CRC32C,                  /* crc32c sum data blocks */
+       VERIFY_CRC32C_ARM64,            /* crc32c sum data blocks with hw */
        VERIFY_CRC32C_INTEL,            /* crc32c sum data blocks with hw */
        VERIFY_CRC16,                   /* crc16 sum data blocks */
        VERIFY_CRC7,                    /* crc7 sum data blocks */