arm64/crc-t10dif: expose CRC-T10DIF function through lib
author    Eric Biggers <ebiggers@google.com>    Mon, 2 Dec 2024 01:20:50 +0000 (17:20 -0800)
committer Eric Biggers <ebiggers@google.com>    Mon, 2 Dec 2024 01:23:13 +0000 (17:23 -0800)
Move the arm64 CRC-T10DIF assembly code into the lib directory and wire
it up to the library interface.  This allows it to be used without going
through the crypto API.  It remains usable via the crypto API as well,
through the shash algorithms that use the library interface.  Thus, all
the arch-specific "shash" code becomes unnecessary and is removed.

Note: to see the diff from arch/arm64/crypto/crct10dif-ce-glue.c to
arch/arm64/lib/crc-t10dif-glue.c, view this commit with 'git show -M10'.
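
As a point of reference, here is a minimal sketch of how kernel code can
consume the library interface that this assembly now backs.  The two
functions come from <linux/crc-t10dif.h>; the wrapper around them is
purely hypothetical and only illustrates the call pattern:

    #include <linux/crc-t10dif.h>

    /* Hypothetical caller: checksum a buffer with CRC-T10DIF. */
    static u16 example_dif_checksum(const u8 *buf, size_t len)
    {
            /* One-shot helper, starting from the standard initial value 0. */
            u16 crc = crc_t10dif(buf, len);

            /* The equivalent incremental form, useful for scattered data. */
            crc = crc_t10dif_update(0, buf, len);

            return crc;
    }

With ARCH_HAS_CRC_T10DIF selected (as the Kconfig hunk below does for
arm64 when KERNEL_MODE_NEON is enabled), these calls are expected to
dispatch to crc_t10dif_arch() from arch/arm64/lib/crc-t10dif-glue.c
instead of the generic table-based implementation.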

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20241202012056.209768-7-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
arch/arm64/Kconfig
arch/arm64/configs/defconfig
arch/arm64/crypto/Kconfig
arch/arm64/crypto/Makefile
arch/arm64/crypto/crct10dif-ce-core.S [deleted file]
arch/arm64/crypto/crct10dif-ce-glue.c [deleted file]
arch/arm64/lib/Makefile
arch/arm64/lib/crc-t10dif-core.S [new file with mode: 0644]
arch/arm64/lib/crc-t10dif-glue.c [new file with mode: 0644]
tools/testing/selftests/arm64/fp/kernel-test.c

index 71f6310c8240eedb18ebbb66c5a94127937eb92d..cbfd357f94a68db3da9c2e580a36d7a361b4c430 100644 (file)
@@ -22,6 +22,7 @@ config ARM64
        select ARCH_HAS_CACHE_LINE_SIZE
        select ARCH_HAS_CC_PLATFORM
        select ARCH_HAS_CRC32
+       select ARCH_HAS_CRC_T10DIF if KERNEL_MODE_NEON
        select ARCH_HAS_CURRENT_STACK_POINTER
        select ARCH_HAS_DEBUG_VIRTUAL
        select ARCH_HAS_DEBUG_VM_PGTABLE
index c62831e6158633f07c1f3532fba62f09b31e7448..9c0d6b93a3c20d9b56fd9ea7abb53be2af95de61 100644 (file)
@@ -1698,7 +1698,6 @@ CONFIG_CRYPTO_SM3_ARM64_CE=m
 CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
 CONFIG_CRYPTO_AES_ARM64_BS=m
 CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
-CONFIG_CRYPTO_CRCT10DIF_ARM64_CE=m
 CONFIG_CRYPTO_DEV_SUN8I_CE=m
 CONFIG_CRYPTO_DEV_FSL_CAAM=m
 CONFIG_CRYPTO_DEV_FSL_DPAA2_CAAM=m
index e7d9bd8e4709b6c4562fb07d0f4352a029b6ad06..5636ab83f22aee2428fe712b8d02ae4b77dfdb40 100644 (file)
@@ -312,15 +312,5 @@ config CRYPTO_SM4_ARM64_CE_GCM
          - PMULL (Polynomial Multiply Long) instructions
          - NEON (Advanced SIMD) extensions
 
-config CRYPTO_CRCT10DIF_ARM64_CE
-       tristate "CRCT10DIF (PMULL)"
-       depends on KERNEL_MODE_NEON && CRC_T10DIF
-       select CRYPTO_HASH
-       help
-         CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF)
-
-         Architecture: arm64 using
-         - PMULL (Polynomial Multiply Long) instructions
-
 endmenu
 
index fbe64dce66e0af7d385bbe6496d8cb41b0e02678..e7139c4768ce4e909834bfcaa479f63bcd9a4aa7 100644 (file)
@@ -44,9 +44,6 @@ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o
 polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o
 
-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
-crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
-
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
 aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
 
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
deleted file mode 100644 (file)
index 87dd6d4..0000000
+++ /dev/null
@@ -1,469 +0,0 @@
-//
-// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
-//
-// Copyright (C) 2016 Linaro Ltd
-// Copyright (C) 2019-2024 Google LLC
-//
-// Authors: Ard Biesheuvel <ardb@google.com>
-//          Eric Biggers <ebiggers@google.com>
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License version 2 as
-// published by the Free Software Foundation.
-//
-
-// Derived from the x86 version:
-//
-// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
-//
-// Copyright (c) 2013, Intel Corporation
-//
-// Authors:
-//     Erdinc Ozturk <erdinc.ozturk@intel.com>
-//     Vinodh Gopal <vinodh.gopal@intel.com>
-//     James Guilford <james.guilford@intel.com>
-//     Tim Chen <tim.c.chen@linux.intel.com>
-//
-// This software is available to you under a choice of one of two
-// licenses.  You may choose to be licensed under the terms of the GNU
-// General Public License (GPL) Version 2, available from the file
-// COPYING in the main directory of this source tree, or the
-// OpenIB.org BSD license below:
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the
-//   distribution.
-//
-// * Neither the name of the Intel Corporation nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-//
-// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-//       Reference paper titled "Fast CRC Computation for Generic
-//     Polynomials Using PCLMULQDQ Instruction"
-//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
-//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-       .text
-       .arch           armv8-a+crypto
-
-       init_crc        .req    w0
-       buf             .req    x1
-       len             .req    x2
-       fold_consts_ptr .req    x5
-
-       fold_consts     .req    v10
-
-       t3              .req    v17
-       t4              .req    v18
-       t5              .req    v19
-       t6              .req    v20
-       t7              .req    v21
-       t8              .req    v22
-
-       perm            .req    v27
-
-       .macro          pmull16x64_p64, a16, b64, c64
-       pmull2          \c64\().1q, \a16\().2d, \b64\().2d
-       pmull           \b64\().1q, \a16\().1d, \b64\().1d
-       .endm
-
-       /*
-        * Pairwise long polynomial multiplication of two 16-bit values
-        *
-        *   { w0, w1 }, { y0, y1 }
-        *
-        * by two 64-bit values
-        *
-        *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
-        *
-        * where each vector element is a byte, ordered from least to most
-        * significant.
-        *
-        * This can be implemented using 8x8 long polynomial multiplication, by
-        * reorganizing the input so that each pairwise 8x8 multiplication
-        * produces one of the terms from the decomposition below, and
-        * combining the results of each rank and shifting them into place.
-        *
-        * Rank
-        *  0            w0*x0 ^              |        y0*z0 ^
-        *  1       (w0*x1 ^ w1*x0) <<  8 ^   |   (y0*z1 ^ y1*z0) <<  8 ^
-        *  2       (w0*x2 ^ w1*x1) << 16 ^   |   (y0*z2 ^ y1*z1) << 16 ^
-        *  3       (w0*x3 ^ w1*x2) << 24 ^   |   (y0*z3 ^ y1*z2) << 24 ^
-        *  4       (w0*x4 ^ w1*x3) << 32 ^   |   (y0*z4 ^ y1*z3) << 32 ^
-        *  5       (w0*x5 ^ w1*x4) << 40 ^   |   (y0*z5 ^ y1*z4) << 40 ^
-        *  6       (w0*x6 ^ w1*x5) << 48 ^   |   (y0*z6 ^ y1*z5) << 48 ^
-        *  7       (w0*x7 ^ w1*x6) << 56 ^   |   (y0*z7 ^ y1*z6) << 56 ^
-        *  8            w1*x7      << 64     |        y1*z7      << 64
-        *
-        * The inputs can be reorganized into
-        *
-        *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
-        *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
-        *
-        * and after performing 8x8->16 bit long polynomial multiplication of
-        * each of the halves of the first vector with those of the second one,
-        * we obtain the following four vectors of 16-bit elements:
-        *
-        *   a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
-        *   b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
-        *   c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
-        *   d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
-        *
-        * Results b and c can be XORed together, as the vector elements have
-        * matching ranks. Then, the final XOR (*) can be pulled forward, and
-        * applied between the halves of each of the remaining three vectors,
-        * which are then shifted into place, and combined to produce two
-        * 80-bit results.
-        *
-        * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
-        * to the 64x64 bit one above, but XOR'ing the outputs together will
-        * produce the expected result, and this is sufficient in the context of
-        * this algorithm.
-        */
-       .macro          pmull16x64_p8, a16, b64, c64
-       ext             t7.16b, \b64\().16b, \b64\().16b, #1
-       tbl             t5.16b, {\a16\().16b}, perm.16b
-       uzp1            t7.16b, \b64\().16b, t7.16b
-       bl              __pmull_p8_16x64
-       ext             \b64\().16b, t4.16b, t4.16b, #15
-       eor             \c64\().16b, t8.16b, t5.16b
-       .endm
-
-SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
-       ext             t6.16b, t5.16b, t5.16b, #8
-
-       pmull           t3.8h, t7.8b, t5.8b
-       pmull           t4.8h, t7.8b, t6.8b
-       pmull2          t5.8h, t7.16b, t5.16b
-       pmull2          t6.8h, t7.16b, t6.16b
-
-       ext             t8.16b, t3.16b, t3.16b, #8
-       eor             t4.16b, t4.16b, t6.16b
-       ext             t7.16b, t5.16b, t5.16b, #8
-       ext             t6.16b, t4.16b, t4.16b, #8
-       eor             t8.8b, t8.8b, t3.8b
-       eor             t5.8b, t5.8b, t7.8b
-       eor             t4.8b, t4.8b, t6.8b
-       ext             t5.16b, t5.16b, t5.16b, #14
-       ret
-SYM_FUNC_END(__pmull_p8_16x64)
-
-
-       // Fold reg1, reg2 into the next 32 data bytes, storing the result back
-       // into reg1, reg2.
-       .macro          fold_32_bytes, p, reg1, reg2
-       ldp             q11, q12, [buf], #0x20
-
-       pmull16x64_\p   fold_consts, \reg1, v8
-
-CPU_LE(        rev64           v11.16b, v11.16b                )
-CPU_LE(        rev64           v12.16b, v12.16b                )
-
-       pmull16x64_\p   fold_consts, \reg2, v9
-
-CPU_LE(        ext             v11.16b, v11.16b, v11.16b, #8   )
-CPU_LE(        ext             v12.16b, v12.16b, v12.16b, #8   )
-
-       eor             \reg1\().16b, \reg1\().16b, v8.16b
-       eor             \reg2\().16b, \reg2\().16b, v9.16b
-       eor             \reg1\().16b, \reg1\().16b, v11.16b
-       eor             \reg2\().16b, \reg2\().16b, v12.16b
-       .endm
-
-       // Fold src_reg into dst_reg, optionally loading the next fold constants
-       .macro          fold_16_bytes, p, src_reg, dst_reg, load_next_consts
-       pmull16x64_\p   fold_consts, \src_reg, v8
-       .ifnb           \load_next_consts
-       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-       .endif
-       eor             \dst_reg\().16b, \dst_reg\().16b, v8.16b
-       eor             \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
-       .endm
-
-       .macro          crc_t10dif_pmull, p
-
-       // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
-       cmp             len, #256
-       b.lt            .Lless_than_256_bytes_\@
-
-       adr_l           fold_consts_ptr, .Lfold_across_128_bytes_consts
-
-       // Load the first 128 data bytes.  Byte swapping is necessary to make
-       // the bit order match the polynomial coefficient order.
-       ldp             q0, q1, [buf]
-       ldp             q2, q3, [buf, #0x20]
-       ldp             q4, q5, [buf, #0x40]
-       ldp             q6, q7, [buf, #0x60]
-       add             buf, buf, #0x80
-CPU_LE(        rev64           v0.16b, v0.16b                  )
-CPU_LE(        rev64           v1.16b, v1.16b                  )
-CPU_LE(        rev64           v2.16b, v2.16b                  )
-CPU_LE(        rev64           v3.16b, v3.16b                  )
-CPU_LE(        rev64           v4.16b, v4.16b                  )
-CPU_LE(        rev64           v5.16b, v5.16b                  )
-CPU_LE(        rev64           v6.16b, v6.16b                  )
-CPU_LE(        rev64           v7.16b, v7.16b                  )
-CPU_LE(        ext             v0.16b, v0.16b, v0.16b, #8      )
-CPU_LE(        ext             v1.16b, v1.16b, v1.16b, #8      )
-CPU_LE(        ext             v2.16b, v2.16b, v2.16b, #8      )
-CPU_LE(        ext             v3.16b, v3.16b, v3.16b, #8      )
-CPU_LE(        ext             v4.16b, v4.16b, v4.16b, #8      )
-CPU_LE(        ext             v5.16b, v5.16b, v5.16b, #8      )
-CPU_LE(        ext             v6.16b, v6.16b, v6.16b, #8      )
-CPU_LE(        ext             v7.16b, v7.16b, v7.16b, #8      )
-
-       // XOR the first 16 data *bits* with the initial CRC value.
-       movi            v8.16b, #0
-       mov             v8.h[7], init_crc
-       eor             v0.16b, v0.16b, v8.16b
-
-       // Load the constants for folding across 128 bytes.
-       ld1             {fold_consts.2d}, [fold_consts_ptr]
-
-       // Subtract 128 for the 128 data bytes just consumed.  Subtract another
-       // 128 to simplify the termination condition of the following loop.
-       sub             len, len, #256
-
-       // While >= 128 data bytes remain (not counting v0-v7), fold the 128
-       // bytes v0-v7 into them, storing the result back into v0-v7.
-.Lfold_128_bytes_loop_\@:
-       fold_32_bytes   \p, v0, v1
-       fold_32_bytes   \p, v2, v3
-       fold_32_bytes   \p, v4, v5
-       fold_32_bytes   \p, v6, v7
-
-       subs            len, len, #128
-       b.ge            .Lfold_128_bytes_loop_\@
-
-       // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
-
-       // Fold across 64 bytes.
-       add             fold_consts_ptr, fold_consts_ptr, #16
-       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-       fold_16_bytes   \p, v0, v4
-       fold_16_bytes   \p, v1, v5
-       fold_16_bytes   \p, v2, v6
-       fold_16_bytes   \p, v3, v7, 1
-       // Fold across 32 bytes.
-       fold_16_bytes   \p, v4, v6
-       fold_16_bytes   \p, v5, v7, 1
-       // Fold across 16 bytes.
-       fold_16_bytes   \p, v6, v7
-
-       // Add 128 to get the correct number of data bytes remaining in 0...127
-       // (not counting v7), following the previous extra subtraction by 128.
-       // Then subtract 16 to simplify the termination condition of the
-       // following loop.
-       adds            len, len, #(128-16)
-
-       // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
-       // into them, storing the result back into v7.
-       b.lt            .Lfold_16_bytes_loop_done_\@
-.Lfold_16_bytes_loop_\@:
-       pmull16x64_\p   fold_consts, v7, v8
-       eor             v7.16b, v7.16b, v8.16b
-       ldr             q0, [buf], #16
-CPU_LE(        rev64           v0.16b, v0.16b                  )
-CPU_LE(        ext             v0.16b, v0.16b, v0.16b, #8      )
-       eor             v7.16b, v7.16b, v0.16b
-       subs            len, len, #16
-       b.ge            .Lfold_16_bytes_loop_\@
-
-.Lfold_16_bytes_loop_done_\@:
-       // Add 16 to get the correct number of data bytes remaining in 0...15
-       // (not counting v7), following the previous extra subtraction by 16.
-       adds            len, len, #16
-       b.eq            .Lreduce_final_16_bytes_\@
-
-.Lhandle_partial_segment_\@:
-       // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
-       // 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
-       // do this without needing a fold constant for each possible 'len',
-       // redivide the bytes into a first chunk of 'len' bytes and a second
-       // chunk of 16 bytes, then fold the first chunk into the second.
-
-       // v0 = last 16 original data bytes
-       add             buf, buf, len
-       ldr             q0, [buf, #-16]
-CPU_LE(        rev64           v0.16b, v0.16b                  )
-CPU_LE(        ext             v0.16b, v0.16b, v0.16b, #8      )
-
-       // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
-       adr_l           x4, .Lbyteshift_table + 16
-       sub             x4, x4, len
-       ld1             {v2.16b}, [x4]
-       tbl             v1.16b, {v7.16b}, v2.16b
-
-       // v3 = first chunk: v7 right-shifted by '16-len' bytes.
-       movi            v3.16b, #0x80
-       eor             v2.16b, v2.16b, v3.16b
-       tbl             v3.16b, {v7.16b}, v2.16b
-
-       // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
-       sshr            v2.16b, v2.16b, #7
-
-       // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
-       // then '16-len' bytes from v1 (high-order bytes).
-       bsl             v2.16b, v1.16b, v0.16b
-
-       // Fold the first chunk into the second chunk, storing the result in v7.
-       pmull16x64_\p   fold_consts, v3, v0
-       eor             v7.16b, v3.16b, v0.16b
-       eor             v7.16b, v7.16b, v2.16b
-       b               .Lreduce_final_16_bytes_\@
-
-.Lless_than_256_bytes_\@:
-       // Checksumming a buffer of length 16...255 bytes
-
-       adr_l           fold_consts_ptr, .Lfold_across_16_bytes_consts
-
-       // Load the first 16 data bytes.
-       ldr             q7, [buf], #0x10
-CPU_LE(        rev64           v7.16b, v7.16b                  )
-CPU_LE(        ext             v7.16b, v7.16b, v7.16b, #8      )
-
-       // XOR the first 16 data *bits* with the initial CRC value.
-       movi            v0.16b, #0
-       mov             v0.h[7], init_crc
-       eor             v7.16b, v7.16b, v0.16b
-
-       // Load the fold-across-16-bytes constants.
-       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-
-       cmp             len, #16
-       b.eq            .Lreduce_final_16_bytes_\@      // len == 16
-       subs            len, len, #32
-       b.ge            .Lfold_16_bytes_loop_\@         // 32 <= len <= 255
-       add             len, len, #16
-       b               .Lhandle_partial_segment_\@     // 17 <= len <= 31
-
-.Lreduce_final_16_bytes_\@:
-       .endm
-
-//
-// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-SYM_FUNC_START(crc_t10dif_pmull_p8)
-       frame_push      1
-
-       // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
-       movi            perm.4h, #8, lsl #8
-       orr             perm.2s, #1, lsl #16
-       orr             perm.2s, #1, lsl #24
-       zip1            perm.16b, perm.16b, perm.16b
-       zip1            perm.16b, perm.16b, perm.16b
-
-       crc_t10dif_pmull p8
-
-CPU_LE(        rev64           v7.16b, v7.16b                  )
-CPU_LE(        ext             v7.16b, v7.16b, v7.16b, #8      )
-       str             q7, [x3]
-
-       frame_pop
-       ret
-SYM_FUNC_END(crc_t10dif_pmull_p8)
-
-       .align          5
-//
-// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-SYM_FUNC_START(crc_t10dif_pmull_p64)
-       crc_t10dif_pmull        p64
-
-       // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
-
-       movi            v2.16b, #0              // init zero register
-
-       // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
-       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-
-       // Fold the high 64 bits into the low 64 bits, while also multiplying by
-       // x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
-       // whose low 48 bits are 0.
-       ext             v0.16b, v2.16b, v7.16b, #8
-       pmull2          v7.1q, v7.2d, fold_consts.2d    // high bits * x^48 * (x^80 mod G(x))
-       eor             v0.16b, v0.16b, v7.16b          // + low bits * x^64
-
-       // Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
-       // value congruent to x^64 * M(x) and whose low 48 bits are 0.
-       ext             v1.16b, v0.16b, v2.16b, #12     // extract high 32 bits
-       mov             v0.s[3], v2.s[0]                // zero high 32 bits
-       pmull           v1.1q, v1.1d, fold_consts.1d    // high 32 bits * x^48 * (x^48 mod G(x))
-       eor             v0.16b, v0.16b, v1.16b          // + low bits
-
-       // Load G(x) and floor(x^48 / G(x)).
-       ld1             {fold_consts.2d}, [fold_consts_ptr]
-
-       // Use Barrett reduction to compute the final CRC value.
-       pmull2          v1.1q, v0.2d, fold_consts.2d    // high 32 bits * floor(x^48 / G(x))
-       ushr            v1.2d, v1.2d, #32               // /= x^32
-       pmull           v1.1q, v1.1d, fold_consts.1d    // *= G(x)
-       ushr            v0.2d, v0.2d, #48
-       eor             v0.16b, v0.16b, v1.16b          // + low 16 nonzero bits
-       // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
-
-       umov            w0, v0.h[0]
-       ret
-SYM_FUNC_END(crc_t10dif_pmull_p64)
-
-       .section        ".rodata", "a"
-       .align          4
-
-// Fold constants precomputed from the polynomial 0x18bb7
-// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
-.Lfold_across_128_bytes_consts:
-       .quad           0x0000000000006123      // x^(8*128)    mod G(x)
-       .quad           0x0000000000002295      // x^(8*128+64) mod G(x)
-// .Lfold_across_64_bytes_consts:
-       .quad           0x0000000000001069      // x^(4*128)    mod G(x)
-       .quad           0x000000000000dd31      // x^(4*128+64) mod G(x)
-// .Lfold_across_32_bytes_consts:
-       .quad           0x000000000000857d      // x^(2*128)    mod G(x)
-       .quad           0x0000000000007acc      // x^(2*128+64) mod G(x)
-.Lfold_across_16_bytes_consts:
-       .quad           0x000000000000a010      // x^(1*128)    mod G(x)
-       .quad           0x0000000000001faa      // x^(1*128+64) mod G(x)
-// .Lfinal_fold_consts:
-       .quad           0x1368000000000000      // x^48 * (x^48 mod G(x))
-       .quad           0x2d56000000000000      // x^48 * (x^80 mod G(x))
-// .Lbarrett_reduction_consts:
-       .quad           0x0000000000018bb7      // G(x)
-       .quad           0x00000001f65a57f8      // floor(x^48 / G(x))
-
-// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
-// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
-// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
-.Lbyteshift_table:
-       .byte            0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
-       .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
-       .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
-       .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
deleted file mode 100644 (file)
index 08bcbd8..0000000
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/cpufeature.h>
-#include <linux/crc-t10dif.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-#define CRC_T10DIF_PMULL_CHUNK_SIZE    16U
-
-asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
-                                   u8 out[16]);
-asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
-
-static int crct10dif_init(struct shash_desc *desc)
-{
-       u16 *crc = shash_desc_ctx(desc);
-
-       *crc = 0;
-       return 0;
-}
-
-static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
-                           unsigned int length)
-{
-       u16 *crcp = shash_desc_ctx(desc);
-       u16 crc = *crcp;
-       u8 buf[16];
-
-       if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
-               kernel_neon_begin();
-               crc_t10dif_pmull_p8(crc, data, length, buf);
-               kernel_neon_end();
-
-               crc = 0;
-               data = buf;
-               length = sizeof(buf);
-       }
-
-       *crcp = crc_t10dif_generic(crc, data, length);
-       return 0;
-}
-
-static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
-                           unsigned int length)
-{
-       u16 *crc = shash_desc_ctx(desc);
-
-       if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
-               kernel_neon_begin();
-               *crc = crc_t10dif_pmull_p64(*crc, data, length);
-               kernel_neon_end();
-       } else {
-               *crc = crc_t10dif_generic(*crc, data, length);
-       }
-
-       return 0;
-}
-
-static int crct10dif_final(struct shash_desc *desc, u8 *out)
-{
-       u16 *crc = shash_desc_ctx(desc);
-
-       *(u16 *)out = *crc;
-       return 0;
-}
-
-static struct shash_alg crc_t10dif_alg[] = {{
-       .digestsize             = CRC_T10DIF_DIGEST_SIZE,
-       .init                   = crct10dif_init,
-       .update                 = crct10dif_update_pmull_p8,
-       .final                  = crct10dif_final,
-       .descsize               = CRC_T10DIF_DIGEST_SIZE,
-
-       .base.cra_name          = "crct10dif",
-       .base.cra_driver_name   = "crct10dif-arm64-neon",
-       .base.cra_priority      = 150,
-       .base.cra_blocksize     = CRC_T10DIF_BLOCK_SIZE,
-       .base.cra_module        = THIS_MODULE,
-}, {
-       .digestsize             = CRC_T10DIF_DIGEST_SIZE,
-       .init                   = crct10dif_init,
-       .update                 = crct10dif_update_pmull_p64,
-       .final                  = crct10dif_final,
-       .descsize               = CRC_T10DIF_DIGEST_SIZE,
-
-       .base.cra_name          = "crct10dif",
-       .base.cra_driver_name   = "crct10dif-arm64-ce",
-       .base.cra_priority      = 200,
-       .base.cra_blocksize     = CRC_T10DIF_BLOCK_SIZE,
-       .base.cra_module        = THIS_MODULE,
-}};
-
-static int __init crc_t10dif_mod_init(void)
-{
-       if (cpu_have_named_feature(PMULL))
-               return crypto_register_shashes(crc_t10dif_alg,
-                                              ARRAY_SIZE(crc_t10dif_alg));
-       else
-               /* only register the first array element */
-               return crypto_register_shash(crc_t10dif_alg);
-}
-
-static void __exit crc_t10dif_mod_exit(void)
-{
-       if (cpu_have_named_feature(PMULL))
-               crypto_unregister_shashes(crc_t10dif_alg,
-                                         ARRAY_SIZE(crc_t10dif_alg));
-       else
-               crypto_unregister_shash(crc_t10dif_alg);
-}
-
-module_cpu_feature_match(ASIMD, crc_t10dif_mod_init);
-module_exit(crc_t10dif_mod_exit);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("crct10dif");
-MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce");
index 5fbcf0d5666550503a0ab6a84632b741dd827cde..4d49dff721a84e5e6cd4f28ed727d7002adc946a 100644 (file)
@@ -16,6 +16,9 @@ lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
 obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o
 crc32-arm64-y := crc32.o crc32-glue.o
 
+obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm64.o
+crc-t10dif-arm64-y := crc-t10dif-glue.o crc-t10dif-core.o
+
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
 obj-$(CONFIG_ARM64_MTE) += mte.o
diff --git a/arch/arm64/lib/crc-t10dif-core.S b/arch/arm64/lib/crc-t10dif-core.S
new file mode 100644 (file)
index 0000000..87dd6d4
--- /dev/null
@@ -0,0 +1,469 @@
+//
+// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <ardb@google.com>
+//          Eric Biggers <ebiggers@google.com>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+// Derived from the x86 version:
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+//     Erdinc Ozturk <erdinc.ozturk@intel.com>
+//     Vinodh Gopal <vinodh.gopal@intel.com>
+//     James Guilford <james.guilford@intel.com>
+//     Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses.  You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the
+//   distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//       Reference paper titled "Fast CRC Computation for Generic
+//     Polynomials Using PCLMULQDQ Instruction"
+//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
+//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+       .text
+       .arch           armv8-a+crypto
+
+       init_crc        .req    w0
+       buf             .req    x1
+       len             .req    x2
+       fold_consts_ptr .req    x5
+
+       fold_consts     .req    v10
+
+       t3              .req    v17
+       t4              .req    v18
+       t5              .req    v19
+       t6              .req    v20
+       t7              .req    v21
+       t8              .req    v22
+
+       perm            .req    v27
+
+       .macro          pmull16x64_p64, a16, b64, c64
+       pmull2          \c64\().1q, \a16\().2d, \b64\().2d
+       pmull           \b64\().1q, \a16\().1d, \b64\().1d
+       .endm
+
+       /*
+        * Pairwise long polynomial multiplication of two 16-bit values
+        *
+        *   { w0, w1 }, { y0, y1 }
+        *
+        * by two 64-bit values
+        *
+        *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+        *
+        * where each vector element is a byte, ordered from least to most
+        * significant.
+        *
+        * This can be implemented using 8x8 long polynomial multiplication, by
+        * reorganizing the input so that each pairwise 8x8 multiplication
+        * produces one of the terms from the decomposition below, and
+        * combining the results of each rank and shifting them into place.
+        *
+        * Rank
+        *  0            w0*x0 ^              |        y0*z0 ^
+        *  1       (w0*x1 ^ w1*x0) <<  8 ^   |   (y0*z1 ^ y1*z0) <<  8 ^
+        *  2       (w0*x2 ^ w1*x1) << 16 ^   |   (y0*z2 ^ y1*z1) << 16 ^
+        *  3       (w0*x3 ^ w1*x2) << 24 ^   |   (y0*z3 ^ y1*z2) << 24 ^
+        *  4       (w0*x4 ^ w1*x3) << 32 ^   |   (y0*z4 ^ y1*z3) << 32 ^
+        *  5       (w0*x5 ^ w1*x4) << 40 ^   |   (y0*z5 ^ y1*z4) << 40 ^
+        *  6       (w0*x6 ^ w1*x5) << 48 ^   |   (y0*z6 ^ y1*z5) << 48 ^
+        *  7       (w0*x7 ^ w1*x6) << 56 ^   |   (y0*z7 ^ y1*z6) << 56 ^
+        *  8            w1*x7      << 64     |        y1*z7      << 64
+        *
+        * The inputs can be reorganized into
+        *
+        *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+        *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+        *
+        * and after performing 8x8->16 bit long polynomial multiplication of
+        * each of the halves of the first vector with those of the second one,
+        * we obtain the following four vectors of 16-bit elements:
+        *
+        *   a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+        *   b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+        *   c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+        *   d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+        *
+        * Results b and c can be XORed together, as the vector elements have
+        * matching ranks. Then, the final XOR (*) can be pulled forward, and
+        * applied between the halves of each of the remaining three vectors,
+        * which are then shifted into place, and combined to produce two
+        * 80-bit results.
+        *
+        * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
+        * to the 64x64 bit one above, but XOR'ing the outputs together will
+        * produce the expected result, and this is sufficient in the context of
+        * this algorithm.
+        */
+       .macro          pmull16x64_p8, a16, b64, c64
+       ext             t7.16b, \b64\().16b, \b64\().16b, #1
+       tbl             t5.16b, {\a16\().16b}, perm.16b
+       uzp1            t7.16b, \b64\().16b, t7.16b
+       bl              __pmull_p8_16x64
+       ext             \b64\().16b, t4.16b, t4.16b, #15
+       eor             \c64\().16b, t8.16b, t5.16b
+       .endm
+
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+       ext             t6.16b, t5.16b, t5.16b, #8
+
+       pmull           t3.8h, t7.8b, t5.8b
+       pmull           t4.8h, t7.8b, t6.8b
+       pmull2          t5.8h, t7.16b, t5.16b
+       pmull2          t6.8h, t7.16b, t6.16b
+
+       ext             t8.16b, t3.16b, t3.16b, #8
+       eor             t4.16b, t4.16b, t6.16b
+       ext             t7.16b, t5.16b, t5.16b, #8
+       ext             t6.16b, t4.16b, t4.16b, #8
+       eor             t8.8b, t8.8b, t3.8b
+       eor             t5.8b, t5.8b, t7.8b
+       eor             t4.8b, t4.8b, t6.8b
+       ext             t5.16b, t5.16b, t5.16b, #14
+       ret
+SYM_FUNC_END(__pmull_p8_16x64)
+
+
+       // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+       // into reg1, reg2.
+       .macro          fold_32_bytes, p, reg1, reg2
+       ldp             q11, q12, [buf], #0x20
+
+       pmull16x64_\p   fold_consts, \reg1, v8
+
+CPU_LE(        rev64           v11.16b, v11.16b                )
+CPU_LE(        rev64           v12.16b, v12.16b                )
+
+       pmull16x64_\p   fold_consts, \reg2, v9
+
+CPU_LE(        ext             v11.16b, v11.16b, v11.16b, #8   )
+CPU_LE(        ext             v12.16b, v12.16b, v12.16b, #8   )
+
+       eor             \reg1\().16b, \reg1\().16b, v8.16b
+       eor             \reg2\().16b, \reg2\().16b, v9.16b
+       eor             \reg1\().16b, \reg1\().16b, v11.16b
+       eor             \reg2\().16b, \reg2\().16b, v12.16b
+       .endm
+
+       // Fold src_reg into dst_reg, optionally loading the next fold constants
+       .macro          fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+       pmull16x64_\p   fold_consts, \src_reg, v8
+       .ifnb           \load_next_consts
+       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
+       .endif
+       eor             \dst_reg\().16b, \dst_reg\().16b, v8.16b
+       eor             \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
+       .endm
+
+       .macro          crc_t10dif_pmull, p
+
+       // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+       cmp             len, #256
+       b.lt            .Lless_than_256_bytes_\@
+
+       adr_l           fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+       // Load the first 128 data bytes.  Byte swapping is necessary to make
+       // the bit order match the polynomial coefficient order.
+       ldp             q0, q1, [buf]
+       ldp             q2, q3, [buf, #0x20]
+       ldp             q4, q5, [buf, #0x40]
+       ldp             q6, q7, [buf, #0x60]
+       add             buf, buf, #0x80
+CPU_LE(        rev64           v0.16b, v0.16b                  )
+CPU_LE(        rev64           v1.16b, v1.16b                  )
+CPU_LE(        rev64           v2.16b, v2.16b                  )
+CPU_LE(        rev64           v3.16b, v3.16b                  )
+CPU_LE(        rev64           v4.16b, v4.16b                  )
+CPU_LE(        rev64           v5.16b, v5.16b                  )
+CPU_LE(        rev64           v6.16b, v6.16b                  )
+CPU_LE(        rev64           v7.16b, v7.16b                  )
+CPU_LE(        ext             v0.16b, v0.16b, v0.16b, #8      )
+CPU_LE(        ext             v1.16b, v1.16b, v1.16b, #8      )
+CPU_LE(        ext             v2.16b, v2.16b, v2.16b, #8      )
+CPU_LE(        ext             v3.16b, v3.16b, v3.16b, #8      )
+CPU_LE(        ext             v4.16b, v4.16b, v4.16b, #8      )
+CPU_LE(        ext             v5.16b, v5.16b, v5.16b, #8      )
+CPU_LE(        ext             v6.16b, v6.16b, v6.16b, #8      )
+CPU_LE(        ext             v7.16b, v7.16b, v7.16b, #8      )
+
+       // XOR the first 16 data *bits* with the initial CRC value.
+       movi            v8.16b, #0
+       mov             v8.h[7], init_crc
+       eor             v0.16b, v0.16b, v8.16b
+
+       // Load the constants for folding across 128 bytes.
+       ld1             {fold_consts.2d}, [fold_consts_ptr]
+
+       // Subtract 128 for the 128 data bytes just consumed.  Subtract another
+       // 128 to simplify the termination condition of the following loop.
+       sub             len, len, #256
+
+       // While >= 128 data bytes remain (not counting v0-v7), fold the 128
+       // bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+       fold_32_bytes   \p, v0, v1
+       fold_32_bytes   \p, v2, v3
+       fold_32_bytes   \p, v4, v5
+       fold_32_bytes   \p, v6, v7
+
+       subs            len, len, #128
+       b.ge            .Lfold_128_bytes_loop_\@
+
+       // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+       // Fold across 64 bytes.
+       add             fold_consts_ptr, fold_consts_ptr, #16
+       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
+       fold_16_bytes   \p, v0, v4
+       fold_16_bytes   \p, v1, v5
+       fold_16_bytes   \p, v2, v6
+       fold_16_bytes   \p, v3, v7, 1
+       // Fold across 32 bytes.
+       fold_16_bytes   \p, v4, v6
+       fold_16_bytes   \p, v5, v7, 1
+       // Fold across 16 bytes.
+       fold_16_bytes   \p, v6, v7
+
+       // Add 128 to get the correct number of data bytes remaining in 0...127
+       // (not counting v7), following the previous extra subtraction by 128.
+       // Then subtract 16 to simplify the termination condition of the
+       // following loop.
+       adds            len, len, #(128-16)
+
+       // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+       // into them, storing the result back into v7.
+       b.lt            .Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+       pmull16x64_\p   fold_consts, v7, v8
+       eor             v7.16b, v7.16b, v8.16b
+       ldr             q0, [buf], #16
+CPU_LE(        rev64           v0.16b, v0.16b                  )
+CPU_LE(        ext             v0.16b, v0.16b, v0.16b, #8      )
+       eor             v7.16b, v7.16b, v0.16b
+       subs            len, len, #16
+       b.ge            .Lfold_16_bytes_loop_\@
+
+.Lfold_16_bytes_loop_done_\@:
+       // Add 16 to get the correct number of data bytes remaining in 0...15
+       // (not counting v7), following the previous extra subtraction by 16.
+       adds            len, len, #16
+       b.eq            .Lreduce_final_16_bytes_\@
+
+.Lhandle_partial_segment_\@:
+       // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+       // 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
+       // do this without needing a fold constant for each possible 'len',
+       // redivide the bytes into a first chunk of 'len' bytes and a second
+       // chunk of 16 bytes, then fold the first chunk into the second.
+
+       // v0 = last 16 original data bytes
+       add             buf, buf, len
+       ldr             q0, [buf, #-16]
+CPU_LE(        rev64           v0.16b, v0.16b                  )
+CPU_LE(        ext             v0.16b, v0.16b, v0.16b, #8      )
+
+       // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+       adr_l           x4, .Lbyteshift_table + 16
+       sub             x4, x4, len
+       ld1             {v2.16b}, [x4]
+       tbl             v1.16b, {v7.16b}, v2.16b
+
+       // v3 = first chunk: v7 right-shifted by '16-len' bytes.
+       movi            v3.16b, #0x80
+       eor             v2.16b, v2.16b, v3.16b
+       tbl             v3.16b, {v7.16b}, v2.16b
+
+       // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+       sshr            v2.16b, v2.16b, #7
+
+       // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+       // then '16-len' bytes from v1 (high-order bytes).
+       bsl             v2.16b, v1.16b, v0.16b
+
+       // Fold the first chunk into the second chunk, storing the result in v7.
+       pmull16x64_\p   fold_consts, v3, v0
+       eor             v7.16b, v3.16b, v0.16b
+       eor             v7.16b, v7.16b, v2.16b
+       b               .Lreduce_final_16_bytes_\@
+
+.Lless_than_256_bytes_\@:
+       // Checksumming a buffer of length 16...255 bytes
+
+       adr_l           fold_consts_ptr, .Lfold_across_16_bytes_consts
+
+       // Load the first 16 data bytes.
+       ldr             q7, [buf], #0x10
+CPU_LE(        rev64           v7.16b, v7.16b                  )
+CPU_LE(        ext             v7.16b, v7.16b, v7.16b, #8      )
+
+       // XOR the first 16 data *bits* with the initial CRC value.
+       movi            v0.16b, #0
+       mov             v0.h[7], init_crc
+       eor             v7.16b, v7.16b, v0.16b
+
+       // Load the fold-across-16-bytes constants.
+       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
+
+       cmp             len, #16
+       b.eq            .Lreduce_final_16_bytes_\@      // len == 16
+       subs            len, len, #32
+       b.ge            .Lfold_16_bytes_loop_\@         // 32 <= len <= 255
+       add             len, len, #16
+       b               .Lhandle_partial_segment_\@     // 17 <= len <= 31
+
+.Lreduce_final_16_bytes_\@:
+       .endm
+
+//
+// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p8)
+       frame_push      1
+
+       // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+       movi            perm.4h, #8, lsl #8
+       orr             perm.2s, #1, lsl #16
+       orr             perm.2s, #1, lsl #24
+       zip1            perm.16b, perm.16b, perm.16b
+       zip1            perm.16b, perm.16b, perm.16b
+
+       crc_t10dif_pmull p8
+
+CPU_LE(        rev64           v7.16b, v7.16b                  )
+CPU_LE(        ext             v7.16b, v7.16b, v7.16b, #8      )
+       str             q7, [x3]
+
+       frame_pop
+       ret
+SYM_FUNC_END(crc_t10dif_pmull_p8)
+
+       .align          5
+//
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p64)
+       crc_t10dif_pmull        p64
+
+       // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+       movi            v2.16b, #0              // init zero register
+
+       // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
+
+       // Fold the high 64 bits into the low 64 bits, while also multiplying by
+       // x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
+       // whose low 48 bits are 0.
+       ext             v0.16b, v2.16b, v7.16b, #8
+       pmull2          v7.1q, v7.2d, fold_consts.2d    // high bits * x^48 * (x^80 mod G(x))
+       eor             v0.16b, v0.16b, v7.16b          // + low bits * x^64
+
+       // Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
+       // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+       ext             v1.16b, v0.16b, v2.16b, #12     // extract high 32 bits
+       mov             v0.s[3], v2.s[0]                // zero high 32 bits
+       pmull           v1.1q, v1.1d, fold_consts.1d    // high 32 bits * x^48 * (x^48 mod G(x))
+       eor             v0.16b, v0.16b, v1.16b          // + low bits
+
+       // Load G(x) and floor(x^48 / G(x)).
+       ld1             {fold_consts.2d}, [fold_consts_ptr]
+
+       // Use Barrett reduction to compute the final CRC value.
+       pmull2          v1.1q, v0.2d, fold_consts.2d    // high 32 bits * floor(x^48 / G(x))
+       ushr            v1.2d, v1.2d, #32               // /= x^32
+       pmull           v1.1q, v1.1d, fold_consts.1d    // *= G(x)
+       ushr            v0.2d, v0.2d, #48
+       eor             v0.16b, v0.16b, v1.16b          // + low 16 nonzero bits
+       // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+       umov            w0, v0.h[0]
+       ret
+SYM_FUNC_END(crc_t10dif_pmull_p64)
+
+       .section        ".rodata", "a"
+       .align          4
+
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+       .quad           0x0000000000006123      // x^(8*128)    mod G(x)
+       .quad           0x0000000000002295      // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+       .quad           0x0000000000001069      // x^(4*128)    mod G(x)
+       .quad           0x000000000000dd31      // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+       .quad           0x000000000000857d      // x^(2*128)    mod G(x)
+       .quad           0x0000000000007acc      // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+       .quad           0x000000000000a010      // x^(1*128)    mod G(x)
+       .quad           0x0000000000001faa      // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+       .quad           0x1368000000000000      // x^48 * (x^48 mod G(x))
+       .quad           0x2d56000000000000      // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+       .quad           0x0000000000018bb7      // G(x)
+       .quad           0x00000001f65a57f8      // floor(x^48 / G(x))
+
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
+       .byte            0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+       .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+       .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
+       .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0
diff --git a/arch/arm64/lib/crc-t10dif-glue.c b/arch/arm64/lib/crc-t10dif-glue.c
new file mode 100644 (file)
index 0000000..dab7e37
--- /dev/null
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/crc-t10dif.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/simd.h>
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static DEFINE_STATIC_KEY_FALSE(have_asimd);
+static DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE    16U
+
+asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
+                                   u8 out[16]);
+asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+
+u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
+{
+       if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
+               if (static_branch_likely(&have_pmull)) {
+                       if (crypto_simd_usable()) {
+                               kernel_neon_begin();
+                               crc = crc_t10dif_pmull_p64(crc, data, length);
+                               kernel_neon_end();
+                               return crc;
+                       }
+               } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
+                          static_branch_likely(&have_asimd) &&
+                          crypto_simd_usable()) {
+                       u8 buf[16];
+
+                       kernel_neon_begin();
+                       crc_t10dif_pmull_p8(crc, data, length, buf);
+                       kernel_neon_end();
+
+                       crc = 0;
+                       data = buf;
+                       length = sizeof(buf);
+               }
+       }
+       return crc_t10dif_generic(crc, data, length);
+}
+EXPORT_SYMBOL(crc_t10dif_arch);
+
+static int __init crc_t10dif_arm64_init(void)
+{
+       if (cpu_have_named_feature(ASIMD)) {
+               static_branch_enable(&have_asimd);
+               if (cpu_have_named_feature(PMULL))
+                       static_branch_enable(&have_pmull);
+       }
+       return 0;
+}
+arch_initcall(crc_t10dif_arm64_init);
+
+static void __exit crc_t10dif_arm64_exit(void)
+{
+}
+module_exit(crc_t10dif_arm64_exit);
+
+bool crc_t10dif_is_optimized(void)
+{
+       return static_key_enabled(&have_asimd);
+}
+EXPORT_SYMBOL(crc_t10dif_is_optimized);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions");
+MODULE_LICENSE("GPL v2");
index 859345379044fc287458644309d66cf5f3d8bdf5..348e8bef62c7a0fc8225ccec6d881c7db0231626 100644 (file)
@@ -46,8 +46,7 @@ static void handle_kick_signal(int sig, siginfo_t *info, void *context)
 }
 
 static char *drivers[] = {
-       "crct10dif-arm64-ce",
-       /* "crct10dif-arm64-neon", - Same priority as generic */
+       "crct10dif-arm64",
        "sha1-ce",
        "sha224-arm64",
        "sha224-arm64-neon",