arm64/crc-t10dif: expose CRC-T10DIF function through lib
author    Eric Biggers <ebiggers@google.com>    Mon, 2 Dec 2024 01:20:50 +0000 (17:20 -0800)
committer Eric Biggers <ebiggers@google.com>    Mon, 2 Dec 2024 01:23:13 +0000 (17:23 -0800)
Move the arm64 CRC-T10DIF assembly code into the lib directory and wire
it up to the library interface.  This allows it to be used without going
through the crypto API.  It remains usable via the crypto API as well,
through the shash algorithms that use the library interface.  Thus, all
the arch-specific "shash" code becomes unnecessary and is removed.

Note: to see the diff from arch/arm64/crypto/crct10dif-ce-glue.c to
arch/arm64/lib/crc-t10dif-glue.c, view this commit with 'git show -M10'.
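
As a point of reference, here is a minimal sketch of how kernel code can
consume the library interface that this assembly now backs.  The two
functions come from <linux/crc-t10dif.h>; the wrapper around them is
purely hypothetical and only illustrates the call pattern:

    #include <linux/crc-t10dif.h>

    /* Hypothetical caller: checksum a buffer with CRC-T10DIF. */
    static u16 example_dif_checksum(const u8 *buf, size_t len)
    {
            /* One-shot helper, starting from the standard initial value 0. */
            u16 crc = crc_t10dif(buf, len);

            /* The equivalent incremental form, useful for scattered data. */
            crc = crc_t10dif_update(0, buf, len);

            return crc;
    }

With ARCH_HAS_CRC_T10DIF selected (as the Kconfig hunk below does for
arm64 when KERNEL_MODE_NEON is enabled), these calls are expected to
dispatch to crc_t10dif_arch() from arch/arm64/lib/crc-t10dif-glue.c
instead of the generic table-based implementation.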

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20241202012056.209768-7-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
arch/arm64/Kconfig
arch/arm64/configs/defconfig
arch/arm64/crypto/Kconfig
arch/arm64/crypto/Makefile
arch/arm64/crypto/crct10dif-ce-core.S [deleted file]
arch/arm64/crypto/crct10dif-ce-glue.c [deleted file]
arch/arm64/lib/Makefile
arch/arm64/lib/crc-t10dif-core.S [new file with mode: 0644]
arch/arm64/lib/crc-t10dif-glue.c [new file with mode: 0644]
tools/testing/selftests/arm64/fp/kernel-test.c

index 71f6310c8240eedb18ebbb66c5a94127937eb92d..cbfd357f94a68db3da9c2e580a36d7a361b4c430 100644 (file)
@@ -22,6 +22,7 @@ config ARM64
        select ARCH_HAS_CACHE_LINE_SIZE
        select ARCH_HAS_CC_PLATFORM
        select ARCH_HAS_CRC32
+       select ARCH_HAS_CRC_T10DIF if KERNEL_MODE_NEON
        select ARCH_HAS_CURRENT_STACK_POINTER
        select ARCH_HAS_DEBUG_VIRTUAL
        select ARCH_HAS_DEBUG_VM_PGTABLE
index c62831e6158633f07c1f3532fba62f09b31e7448..9c0d6b93a3c20d9b56fd9ea7abb53be2af95de61 100644 (file)
@@ -1698,7 +1698,6 @@ CONFIG_CRYPTO_SM3_ARM64_CE=m
 CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
 CONFIG_CRYPTO_AES_ARM64_BS=m
 CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
-CONFIG_CRYPTO_CRCT10DIF_ARM64_CE=m
 CONFIG_CRYPTO_DEV_SUN8I_CE=m
 CONFIG_CRYPTO_DEV_FSL_CAAM=m
 CONFIG_CRYPTO_DEV_FSL_DPAA2_CAAM=m
index e7d9bd8e4709b6c4562fb07d0f4352a029b6ad06..5636ab83f22aee2428fe712b8d02ae4b77dfdb40 100644 (file)
@@ -312,15 +312,5 @@ config CRYPTO_SM4_ARM64_CE_GCM
          - PMULL (Polynomial Multiply Long) instructions
          - NEON (Advanced SIMD) extensions
 
-config CRYPTO_CRCT10DIF_ARM64_CE
-       tristate "CRCT10DIF (PMULL)"
-       depends on KERNEL_MODE_NEON && CRC_T10DIF
-       select CRYPTO_HASH
-       help
-         CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF)
-
-         Architecture: arm64 using
-         - PMULL (Polynomial Multiply Long) instructions
-
 endmenu
 
index fbe64dce66e0af7d385bbe6496d8cb41b0e02678..e7139c4768ce4e909834bfcaa479f63bcd9a4aa7 100644 (file)
@@ -44,9 +44,6 @@ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o
 polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o
 
-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
-crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
-
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
 aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
 
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
deleted file mode 100644 (file)
index 87dd6d4..0000000
+++ /dev/null
@@ -1,469 +0,0 @@
-//
-// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
-//
-// Copyright (C) 2016 Linaro Ltd
-// Copyright (C) 2019-2024 Google LLC
-//
-// Authors: Ard Biesheuvel <ardb@google.com>
-//          Eric Biggers <ebiggers@google.com>
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License version 2 as
-// published by the Free Software Foundation.
-//
-
-// Derived from the x86 version:
-//
-// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
-//
-// Copyright (c) 2013, Intel Corporation
-//
-// Authors:
-//     Erdinc Ozturk <erdinc.ozturk@intel.com>
-//     Vinodh Gopal <vinodh.gopal@intel.com>
-//     James Guilford <james.guilford@intel.com>
-//     Tim Chen <tim.c.chen@linux.intel.com>
-//
-// This software is available to you under a choice of one of two
-// licenses.  You may choose to be licensed under the terms of the GNU
-// General Public License (GPL) Version 2, available from the file
-// COPYING in the main directory of this source tree, or the
-// OpenIB.org BSD license below:
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the
-//   distribution.
-//
-// * Neither the name of the Intel Corporation nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-//
-//
-// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-//       Reference paper titled "Fast CRC Computation for Generic
-//     Polynomials Using PCLMULQDQ Instruction"
-//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
-//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-       .text
-       .arch           armv8-a+crypto
-
-       init_crc        .req    w0
-       buf             .req    x1
-       len             .req    x2
-       fold_consts_ptr .req    x5
-
-       fold_consts     .req    v10
-
-       t3              .req    v17
-       t4              .req    v18
-       t5              .req    v19
-       t6              .req    v20
-       t7              .req    v21
-       t8              .req    v22
-
-       perm            .req    v27
-
-       .macro          pmull16x64_p64, a16, b64, c64
-       pmull2          \c64\().1q, \a16\().2d, \b64\().2d
-       pmull           \b64\().1q, \a16\().1d, \b64\().1d
-       .endm
-
-       /*
-        * Pairwise long polynomial multiplication of two 16-bit values
-        *
-        *   { w0, w1 }, { y0, y1 }
-        *
-        * by two 64-bit values
-        *
-        *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
-        *
-        * where each vector element is a byte, ordered from least to most
-        * significant.
-        *
-        * This can be implemented using 8x8 long polynomial multiplication, by
-        * reorganizing the input so that each pairwise 8x8 multiplication
-        * produces one of the terms from the decomposition below, and
-        * combining the results of each rank and shifting them into place.
-        *
-        * Rank
-        *  0            w0*x0 ^              |        y0*z0 ^
-        *  1       (w0*x1 ^ w1*x0) <<  8 ^   |   (y0*z1 ^ y1*z0) <<  8 ^
-        *  2       (w0*x2 ^ w1*x1) << 16 ^   |   (y0*z2 ^ y1*z1) << 16 ^
-        *  3       (w0*x3 ^ w1*x2) << 24 ^   |   (y0*z3 ^ y1*z2) << 24 ^
-        *  4       (w0*x4 ^ w1*x3) << 32 ^   |   (y0*z4 ^ y1*z3) << 32 ^
-        *  5       (w0*x5 ^ w1*x4) << 40 ^   |   (y0*z5 ^ y1*z4) << 40 ^
-        *  6       (w0*x6 ^ w1*x5) << 48 ^   |   (y0*z6 ^ y1*z5) << 48 ^
-        *  7       (w0*x7 ^ w1*x6) << 56 ^   |   (y0*z7 ^ y1*z6) << 56 ^
-        *  8            w1*x7      << 64     |        y1*z7      << 64
-        *
-        * The inputs can be reorganized into
-        *
-        *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
-        *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
-        *
-        * and after performing 8x8->16 bit long polynomial multiplication of
-        * each of the halves of the first vector with those of the second one,
-        * we obtain the following four vectors of 16-bit elements:
-        *
-        *   a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
-        *   b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
-        *   c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
-        *   d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
-        *
-        * Results b and c can be XORed together, as the vector elements have
-        * matching ranks. Then, the final XOR (*) can be pulled forward, and
-        * applied between the halves of each of the remaining three vectors,
-        * which are then shifted into place, and combined to produce two
-        * 80-bit results.
-        *
-        * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
-        * to the 64x64 bit one above, but XOR'ing the outputs together will
-        * produce the expected result, and this is sufficient in the context of
-        * this algorithm.
-        */
-       .macro          pmull16x64_p8, a16, b64, c64
-       ext             t7.16b, \b64\().16b, \b64\().16b, #1
-       tbl             t5.16b, {\a16\().16b}, perm.16b
-       uzp1            t7.16b, \b64\().16b, t7.16b
-       bl              __pmull_p8_16x64
-       ext             \b64\().16b, t4.16b, t4.16b, #15
-       eor             \c64\().16b, t8.16b, t5.16b
-       .endm
-
-SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
-       ext             t6.16b, t5.16b, t5.16b, #8
-
-       pmull           t3.8h, t7.8b, t5.8b
-       pmull           t4.8h, t7.8b, t6.8b
-       pmull2          t5.8h, t7.16b, t5.16b
-       pmull2          t6.8h, t7.16b, t6.16b
-
-       ext             t8.16b, t3.16b, t3.16b, #8
-       eor             t4.16b, t4.16b, t6.16b
-       ext             t7.16b, t5.16b, t5.16b, #8
-       ext             t6.16b, t4.16b, t4.16b, #8
-       eor             t8.8b, t8.8b, t3.8b
-       eor             t5.8b, t5.8b, t7.8b
-       eor             t4.8b, t4.8b, t6.8b
-       ext             t5.16b, t5.16b, t5.16b, #14
-       ret
-SYM_FUNC_END(__pmull_p8_16x64)
-
-
-       // Fold reg1, reg2 into the next 32 data bytes, storing the result back
-       // into reg1, reg2.
-       .macro          fold_32_bytes, p, reg1, reg2
-       ldp             q11, q12, [buf], #0x20
-
-       pmull16x64_\p   fold_consts, \reg1, v8
-
-CPU_LE(        rev64           v11.16b, v11.16b                )
-CPU_LE(        rev64           v12.16b, v12.16b                )
-
-       pmull16x64_\p   fold_consts, \reg2, v9
-
-CPU_LE(        ext             v11.16b, v11.16b, v11.16b, #8   )
-CPU_LE(        ext             v12.16b, v12.16b, v12.16b, #8   )
-
-       eor             \reg1\().16b, \reg1\().16b, v8.16b
-       eor             \reg2\().16b, \reg2\().16b, v9.16b
-       eor             \reg1\().16b, \reg1\().16b, v11.16b
-       eor             \reg2\().16b, \reg2\().16b, v12.16b
-       .endm
-
-       // Fold src_reg into dst_reg, optionally loading the next fold constants
-       .macro          fold_16_bytes, p, src_reg, dst_reg, load_next_consts
-       pmull16x64_\p   fold_consts, \src_reg, v8
-       .ifnb           \load_next_consts
-       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-       .endif
-       eor             \dst_reg\().16b, \dst_reg\().16b, v8.16b
-       eor             \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
-       .endm
-
-       .macro          crc_t10dif_pmull, p
-
-       // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
-       cmp             len, #256
-       b.lt            .Lless_than_256_bytes_\@
-
-       adr_l           fold_consts_ptr, .Lfold_across_128_bytes_consts
-
-       // Load the first 128 data bytes.  Byte swapping is necessary to make
-       // the bit order match the polynomial coefficient order.
-       ldp             q0, q1, [buf]
-       ldp             q2, q3, [buf, #0x20]
-       ldp             q4, q5, [buf, #0x40]
-       ldp             q6, q7, [buf, #0x60]
-       add             buf, buf, #0x80
-CPU_LE(        rev64           v0.16b, v0.16b                  )
-CPU_LE(        rev64           v1.16b, v1.16b                  )
-CPU_LE(        rev64           v2.16b, v2.16b                  )
-CPU_LE(        rev64           v3.16b, v3.16b                  )
-CPU_LE(        rev64           v4.16b, v4.16b                  )
-CPU_LE(        rev64           v5.16b, v5.16b                  )
-CPU_LE(        rev64           v6.16b, v6.16b                  )
-CPU_LE(        rev64           v7.16b, v7.16b                  )
-CPU_LE(        ext             v0.16b, v0.16b, v0.16b, #8      )
-CPU_LE(        ext             v1.16b, v1.16b, v1.16b, #8      )
-CPU_LE(        ext             v2.16b, v2.16b, v2.16b, #8      )
-CPU_LE(        ext             v3.16b, v3.16b, v3.16b, #8      )
-CPU_LE(        ext             v4.16b, v4.16b, v4.16b, #8      )
-CPU_LE(        ext             v5.16b, v5.16b, v5.16b, #8      )
-CPU_LE(        ext             v6.16b, v6.16b, v6.16b, #8      )
-CPU_LE(        ext             v7.16b, v7.16b, v7.16b, #8      )
-
-       // XOR the first 16 data *bits* with the initial CRC value.
-       movi            v8.16b, #0
-       mov             v8.h[7], init_crc
-       eor             v0.16b, v0.16b, v8.16b
-
-       // Load the constants for folding across 128 bytes.
-       ld1             {fold_consts.2d}, [fold_consts_ptr]
-
-       // Subtract 128 for the 128 data bytes just consumed.  Subtract another
-       // 128 to simplify the termination condition of the following loop.
-       sub             len, len, #256
-
-       // While >= 128 data bytes remain (not counting v0-v7), fold the 128
-       // bytes v0-v7 into them, storing the result back into v0-v7.
-.Lfold_128_bytes_loop_\@:
-       fold_32_bytes   \p, v0, v1
-       fold_32_bytes   \p, v2, v3
-       fold_32_bytes   \p, v4, v5
-       fold_32_bytes   \p, v6, v7
-
-       subs            len, len, #128
-       b.ge            .Lfold_128_bytes_loop_\@
-
-       // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
-
-       // Fold across 64 bytes.
-       add             fold_consts_ptr, fold_consts_ptr, #16
-       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-       fold_16_bytes   \p, v0, v4
-       fold_16_bytes   \p, v1, v5
-       fold_16_bytes   \p, v2, v6
-       fold_16_bytes   \p, v3, v7, 1
-       // Fold across 32 bytes.
-       fold_16_bytes   \p, v4, v6
-       fold_16_bytes   \p, v5, v7, 1
-       // Fold across 16 bytes.
-       fold_16_bytes   \p, v6, v7
-
-       // Add 128 to get the correct number of data bytes remaining in 0...127
-       // (not counting v7), following the previous extra subtraction by 128.
-       // Then subtract 16 to simplify the termination condition of the
-       // following loop.
-       adds            len, len, #(128-16)
-
-       // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
-       // into them, storing the result back into v7.
-       b.lt            .Lfold_16_bytes_loop_done_\@
-.Lfold_16_bytes_loop_\@:
-       pmull16x64_\p   fold_consts, v7, v8
-       eor             v7.16b, v7.16b, v8.16b
-       ldr             q0, [buf], #16
-CPU_LE(        rev64           v0.16b, v0.16b                  )
-CPU_LE(        ext             v0.16b, v0.16b, v0.16b, #8      )
-       eor             v7.16b, v7.16b, v0.16b
-       subs            len, len, #16
-       b.ge            .Lfold_16_bytes_loop_\@
-
-.Lfold_16_bytes_loop_done_\@:
-       // Add 16 to get the correct number of data bytes remaining in 0...15
-       // (not counting v7), following the previous extra subtraction by 16.
-       adds            len, len, #16
-       b.eq            .Lreduce_final_16_bytes_\@
-
-.Lhandle_partial_segment_\@:
-       // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
-       // 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
-       // do this without needing a fold constant for each possible 'len',
-       // redivide the bytes into a first chunk of 'len' bytes and a second
-       // chunk of 16 bytes, then fold the first chunk into the second.
-
-       // v0 = last 16 original data bytes
-       add             buf, buf, len
-       ldr             q0, [buf, #-16]
-CPU_LE(        rev64           v0.16b, v0.16b                  )
-CPU_LE(        ext             v0.16b, v0.16b, v0.16b, #8      )
-
-       // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
-       adr_l           x4, .Lbyteshift_table + 16
-       sub             x4, x4, len
-       ld1             {v2.16b}, [x4]
-       tbl             v1.16b, {v7.16b}, v2.16b
-
-       // v3 = first chunk: v7 right-shifted by '16-len' bytes.
-       movi            v3.16b, #0x80
-       eor             v2.16b, v2.16b, v3.16b
-       tbl             v3.16b, {v7.16b}, v2.16b
-
-       // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
-       sshr            v2.16b, v2.16b, #7
-
-       // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
-       // then '16-len' bytes from v1 (high-order bytes).
-       bsl             v2.16b, v1.16b, v0.16b
-
-       // Fold the first chunk into the second chunk, storing the result in v7.
-       pmull16x64_\p   fold_consts, v3, v0
-       eor             v7.16b, v3.16b, v0.16b
-       eor             v7.16b, v7.16b, v2.16b
-       b               .Lreduce_final_16_bytes_\@
-
-.Lless_than_256_bytes_\@:
-       // Checksumming a buffer of length 16...255 bytes
-
-       adr_l           fold_consts_ptr, .Lfold_across_16_bytes_consts
-
-       // Load the first 16 data bytes.
-       ldr             q7, [buf], #0x10
-CPU_LE(        rev64           v7.16b, v7.16b                  )
-CPU_LE(        ext             v7.16b, v7.16b, v7.16b, #8      )
-
-       // XOR the first 16 data *bits* with the initial CRC value.
-       movi            v0.16b, #0
-       mov             v0.h[7], init_crc
-       eor             v7.16b, v7.16b, v0.16b
-
-       // Load the fold-across-16-bytes constants.
-       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-
-       cmp             len, #16
-       b.eq            .Lreduce_final_16_bytes_\@      // len == 16
-       subs            len, len, #32
-       b.ge            .Lfold_16_bytes_loop_\@         // 32 <= len <= 255
-       add             len, len, #16
-       b               .Lhandle_partial_segment_\@     // 17 <= len <= 31
-
-.Lreduce_final_16_bytes_\@:
-       .endm
-
-//
-// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-SYM_FUNC_START(crc_t10dif_pmull_p8)
-       frame_push      1
-
-       // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
-       movi            perm.4h, #8, lsl #8
-       orr             perm.2s, #1, lsl #16
-       orr             perm.2s, #1, lsl #24
-       zip1            perm.16b, perm.16b, perm.16b
-       zip1            perm.16b, perm.16b, perm.16b
-
-       crc_t10dif_pmull p8
-
-CPU_LE(        rev64           v7.16b, v7.16b                  )
-CPU_LE(        ext             v7.16b, v7.16b, v7.16b, #8      )
-       str             q7, [x3]
-
-       frame_pop
-       ret
-SYM_FUNC_END(crc_t10dif_pmull_p8)
-
-       .align          5
-//
-// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-SYM_FUNC_START(crc_t10dif_pmull_p64)
-       crc_t10dif_pmull        p64
-
-       // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
-
-       movi            v2.16b, #0              // init zero register
-
-       // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
-       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-
-       // Fold the high 64 bits into the low 64 bits, while also multiplying by
-       // x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
-       // whose low 48 bits are 0.
-       ext             v0.16b, v2.16b, v7.16b, #8
-       pmull2          v7.1q, v7.2d, fold_consts.2d    // high bits * x^48 * (x^80 mod G(x))
-       eor             v0.16b, v0.16b, v7.16b          // + low bits * x^64
-
-       // Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
-       // value congruent to x^64 * M(x) and whose low 48 bits are 0.
-       ext             v1.16b, v0.16b, v2.16b, #12     // extract high 32 bits
-       mov             v0.s[3], v2.s[0]                // zero high 32 bits
-       pmull           v1.1q, v1.1d, fold_consts.1d    // high 32 bits * x^48 * (x^48 mod G(x))
-       eor             v0.16b, v0.16b, v1.16b          // + low bits
-
-       // Load G(x) and floor(x^48 / G(x)).
-       ld1             {fold_consts.2d}, [fold_consts_ptr]
-
-       // Use Barrett reduction to compute the final CRC value.
-       pmull2          v1.1q, v0.2d, fold_consts.2d    // high 32 bits * floor(x^48 / G(x))
-       ushr            v1.2d, v1.2d, #32               // /= x^32
-       pmull           v1.1q, v1.1d, fold_consts.1d    // *= G(x)
-       ushr            v0.2d, v0.2d, #48
-       eor             v0.16b, v0.16b, v1.16b          // + low 16 nonzero bits
-       // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
-
-       umov            w0, v0.h[0]
-       ret
-SYM_FUNC_END(crc_t10dif_pmull_p64)
-
-       .section        ".rodata", "a"
-       .align          4
-
-// Fold constants precomputed from the polynomial 0x18bb7
-// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
-.Lfold_across_128_bytes_consts:
-       .quad           0x0000000000006123      // x^(8*128)    mod G(x)
-       .quad           0x0000000000002295      // x^(8*128+64) mod G(x)
-// .Lfold_across_64_bytes_consts:
-       .quad           0x0000000000001069      // x^(4*128)    mod G(x)
-       .quad           0x000000000000dd31      // x^(4*128+64) mod G(x)
-// .Lfold_across_32_bytes_consts:
-       .quad           0x000000000000857d      // x^(2*128)    mod G(x)
-       .quad           0x0000000000007acc      // x^(2*128+64) mod G(x)
-.Lfold_across_16_bytes_consts:
-       .quad           0x000000000000a010      // x^(1*128)    mod G(x)
-       .quad           0x0000000000001faa      // x^(1*128+64) mod G(x)
-// .Lfinal_fold_consts:
-       .quad           0x1368000000000000      // x^48 * (x^48 mod G(x))
-       .quad           0x2d56000000000000      // x^48 * (x^80 mod G(x))
-// .Lbarrett_reduction_consts:
-       .quad           0x0000000000018bb7      // G(x)
-       .quad           0x00000001f65a57f8      // floor(x^48 / G(x))
-
-// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
-// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
-// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
-.Lbyteshift_table:
-       .byte            0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
-       .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
-       .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
-       .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
deleted file mode 100644 (file)
index 08bcbd8..0000000
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/cpufeature.h>
-#include <linux/crc-t10dif.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-#define CRC_T10DIF_PMULL_CHUNK_SIZE    16U
-
-asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
-                                   u8 out[16]);
-asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
-
-static int crct10dif_init(struct shash_desc *desc)
-{
-       u16 *crc = shash_desc_ctx(desc);
-
-       *crc = 0;
-       return 0;
-}
-
-static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
-                           unsigned int length)
-{
-       u16 *crcp = shash_desc_ctx(desc);
-       u16 crc = *crcp;
-       u8 buf[16];
-
-       if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
-               kernel_neon_begin();
-               crc_t10dif_pmull_p8(crc, data, length, buf);
-               kernel_neon_end();
-
-               crc = 0;
-               data = buf;
-               length = sizeof(buf);
-       }
-
-       *crcp = crc_t10dif_generic(crc, data, length);
-       return 0;
-}
-
-static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
-                           unsigned int length)
-{
-       u16 *crc = shash_desc_ctx(desc);
-
-       if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
-               kernel_neon_begin();
-               *crc = crc_t10dif_pmull_p64(*crc, data, length);
-               kernel_neon_end();
-       } else {
-               *crc = crc_t10dif_generic(*crc, data, length);
-       }
-
-       return 0;
-}
-
-static int crct10dif_final(struct shash_desc *desc, u8 *out)
-{
-       u16 *crc = shash_desc_ctx(desc);
-
-       *(u16 *)out = *crc;
-       return 0;
-}
-
-static struct shash_alg crc_t10dif_alg[] = {{
-       .digestsize             = CRC_T10DIF_DIGEST_SIZE,
-       .init                   = crct10dif_init,
-       .update                 = crct10dif_update_pmull_p8,
-       .final                  = crct10dif_final,
-       .descsize               = CRC_T10DIF_DIGEST_SIZE,
-
-       .base.cra_name          = "crct10dif",
-       .base.cra_driver_name   = "crct10dif-arm64-neon",
-       .base.cra_priority      = 150,
-       .base.cra_blocksize     = CRC_T10DIF_BLOCK_SIZE,
-       .base.cra_module        = THIS_MODULE,
-}, {
-       .digestsize             = CRC_T10DIF_DIGEST_SIZE,
-       .init                   = crct10dif_init,
-       .update                 = crct10dif_update_pmull_p64,
-       .final                  = crct10dif_final,
-       .descsize               = CRC_T10DIF_DIGEST_SIZE,
-
-       .base.cra_name          = "crct10dif",
-       .base.cra_driver_name   = "crct10dif-arm64-ce",
-       .base.cra_priority      = 200,
-       .base.cra_blocksize     = CRC_T10DIF_BLOCK_SIZE,
-       .base.cra_module        = THIS_MODULE,
-}};
-
-static int __init crc_t10dif_mod_init(void)
-{
-       if (cpu_have_named_feature(PMULL))
-               return crypto_register_shashes(crc_t10dif_alg,
-                                              ARRAY_SIZE(crc_t10dif_alg));
-       else
-               /* only register the first array element */
-               return crypto_register_shash(crc_t10dif_alg);
-}
-
-static void __exit crc_t10dif_mod_exit(void)
-{
-       if (cpu_have_named_feature(PMULL))
-               crypto_unregister_shashes(crc_t10dif_alg,
-                                         ARRAY_SIZE(crc_t10dif_alg));
-       else
-               crypto_unregister_shash(crc_t10dif_alg);
-}
-
-module_cpu_feature_match(ASIMD, crc_t10dif_mod_init);
-module_exit(crc_t10dif_mod_exit);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("crct10dif");
-MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce");
index 5fbcf0d5666550503a0ab6a84632b741dd827cde..4d49dff721a84e5e6cd4f28ed727d7002adc946a 100644 (file)
@@ -16,6 +16,9 @@ lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
 obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o
 crc32-arm64-y := crc32.o crc32-glue.o
 
+obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm64.o
+crc-t10dif-arm64-y := crc-t10dif-glue.o crc-t10dif-core.o
+
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
 obj-$(CONFIG_ARM64_MTE) += mte.o
diff --git a/arch/arm64/lib/crc-t10dif-core.S b/arch/arm64/lib/crc-t10dif-core.S
new file mode 100644 (file)
index 0000000..87dd6d4
--- /dev/null
@@ -0,0 +1,469 @@
+//
+// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <ardb@google.com>
+//          Eric Biggers <ebiggers@google.com>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+// Derived from the x86 version:
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+//     Erdinc Ozturk <erdinc.ozturk@intel.com>
+//     Vinodh Gopal <vinodh.gopal@intel.com>
+//     James Guilford <james.guilford@intel.com>
+//     Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses.  You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the
+//   distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//       Reference paper titled "Fast CRC Computation for Generic
+//     Polynomials Using PCLMULQDQ Instruction"
+//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
+//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+       .text
+       .arch           armv8-a+crypto
+
+       init_crc        .req    w0
+       buf             .req    x1
+       len             .req    x2
+       fold_consts_ptr .req    x5
+
+       fold_consts     .req    v10
+
+       t3              .req    v17
+       t4              .req    v18
+       t5              .req    v19
+       t6              .req    v20
+       t7              .req    v21
+       t8              .req    v22
+
+       perm            .req    v27
+
+       .macro          pmull16x64_p64, a16, b64, c64
+       pmull2          \c64\().1q, \a16\().2d, \b64\().2d
+       pmull           \b64\().1q, \a16\().1d, \b64\().1d
+       .endm
+
+       /*
+        * Pairwise long polynomial multiplication of two 16-bit values
+        *
+        *   { w0, w1 }, { y0, y1 }
+        *
+        * by two 64-bit values
+        *
+        *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+        *
+        * where each vector element is a byte, ordered from least to most
+        * significant.
+        *
+        * This can be implemented using 8x8 long polynomial multiplication, by
+        * reorganizing the input so that each pairwise 8x8 multiplication
+        * produces one of the terms from the decomposition below, and
+        * combining the results of each rank and shifting them into place.
+        *
+        * Rank
+        *  0            w0*x0 ^              |        y0*z0 ^
+        *  1       (w0*x1 ^ w1*x0) <<  8 ^   |   (y0*z1 ^ y1*z0) <<  8 ^
+        *  2       (w0*x2 ^ w1*x1) << 16 ^   |   (y0*z2 ^ y1*z1) << 16 ^
+        *  3       (w0*x3 ^ w1*x2) << 24 ^   |   (y0*z3 ^ y1*z2) << 24 ^
+        *  4       (w0*x4 ^ w1*x3) << 32 ^   |   (y0*z4 ^ y1*z3) << 32 ^
+        *  5       (w0*x5 ^ w1*x4) << 40 ^   |   (y0*z5 ^ y1*z4) << 40 ^
+        *  6       (w0*x6 ^ w1*x5) << 48 ^   |   (y0*z6 ^ y1*z5) << 48 ^
+        *  7       (w0*x7 ^ w1*x6) << 56 ^   |   (y0*z7 ^ y1*z6) << 56 ^
+        *  8            w1*x7      << 64     |        y1*z7      << 64
+        *
+        * The inputs can be reorganized into
+        *
+        *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+        *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+        *
+        * and after performing 8x8->16 bit long polynomial multiplication of
+        * each of the halves of the first vector with those of the second one,
+        * we obtain the following four vectors of 16-bit elements:
+        *
+        *   a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+        *   b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+        *   c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+        *   d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+        *
+        * Results b and c can be XORed together, as the vector elements have
+        * matching ranks. Then, the final XOR (*) can be pulled forward, and
+        * applied between the halves of each of the remaining three vectors,
+        * which are then shifted into place, and combined to produce two
+        * 80-bit results.
+        *
+        * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
+        * to the 64x64 bit one above, but XOR'ing the outputs together will
+        * produce the expected result, and this is sufficient in the context of
+        * this algorithm.
+        */
+       .macro          pmull16x64_p8, a16, b64, c64
+       ext             t7.16b, \b64\().16b, \b64\().16b, #1
+       tbl             t5.16b, {\a16\().16b}, perm.16b
+       uzp1            t7.16b, \b64\().16b, t7.16b
+       bl              __pmull_p8_16x64
+       ext             \b64\().16b, t4.16b, t4.16b, #15
+       eor             \c64\().16b, t8.16b, t5.16b
+       .endm
+
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+       ext             t6.16b, t5.16b, t5.16b, #8
+
+       pmull           t3.8h, t7.8b, t5.8b
+       pmull           t4.8h, t7.8b, t6.8b
+       pmull2          t5.8h, t7.16b, t5.16b
+       pmull2          t6.8h, t7.16b, t6.16b
+
+       ext             t8.16b, t3.16b, t3.16b, #8
+       eor             t4.16b, t4.16b, t6.16b
+       ext             t7.16b, t5.16b, t5.16b, #8
+       ext             t6.16b, t4.16b, t4.16b, #8
+       eor             t8.8b, t8.8b, t3.8b
+       eor             t5.8b, t5.8b, t7.8b
+       eor             t4.8b, t4.8b, t6.8b
+       ext             t5.16b, t5.16b, t5.16b, #14
+       ret
+SYM_FUNC_END(__pmull_p8_16x64)
+
+
+       // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+       // into reg1, reg2.
+       .macro          fold_32_bytes, p, reg1, reg2
+       ldp             q11, q12, [buf], #0x20
+
+       pmull16x64_\p   fold_consts, \reg1, v8
+
+CPU_LE(        rev64           v11.16b, v11.16b                )
+CPU_LE(        rev64           v12.16b, v12.16b                )
+
+       pmull16x64_\p   fold_consts, \reg2, v9
+
+CPU_LE(        ext             v11.16b, v11.16b, v11.16b, #8   )
+CPU_LE(        ext             v12.16b, v12.16b, v12.16b, #8   )
+
+       eor             \reg1\().16b, \reg1\().16b, v8.16b
+       eor             \reg2\().16b, \reg2\().16b, v9.16b
+       eor             \reg1\().16b, \reg1\().16b, v11.16b
+       eor             \reg2\().16b, \reg2\().16b, v12.16b
+       .endm
+
+       // Fold src_reg into dst_reg, optionally loading the next fold constants
+       .macro          fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+       pmull16x64_\p   fold_consts, \src_reg, v8
+       .ifnb           \load_next_consts
+       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
+       .endif
+       eor             \dst_reg\().16b, \dst_reg\().16b, v8.16b
+       eor             \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
+       .endm
+
+       .macro          crc_t10dif_pmull, p
+
+       // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+       cmp             len, #256
+       b.lt            .Lless_than_256_bytes_\@
+
+       adr_l           fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+       // Load the first 128 data bytes.  Byte swapping is necessary to make
+       // the bit order match the polynomial coefficient order.
+       ldp             q0, q1, [buf]
+       ldp             q2, q3, [buf, #0x20]
+       ldp             q4, q5, [buf, #0x40]
+       ldp             q6, q7, [buf, #0x60]
+       add             buf, buf, #0x80
+CPU_LE(        rev64           v0.16b, v0.16b                  )
+CPU_LE(        rev64           v1.16b, v1.16b                  )
+CPU_LE(        rev64           v2.16b, v2.16b                  )
+CPU_LE(        rev64           v3.16b, v3.16b                  )
+CPU_LE(        rev64           v4.16b, v4.16b                  )
+CPU_LE(        rev64           v5.16b, v5.16b                  )
+CPU_LE(        rev64           v6.16b, v6.16b                  )
+CPU_LE(        rev64           v7.16b, v7.16b                  )
+CPU_LE(        ext             v0.16b, v0.16b, v0.16b, #8      )
+CPU_LE(        ext             v1.16b, v1.16b, v1.16b, #8      )
+CPU_LE(        ext             v2.16b, v2.16b, v2.16b, #8      )
+CPU_LE(        ext             v3.16b, v3.16b, v3.16b, #8      )
+CPU_LE(        ext             v4.16b, v4.16b, v4.16b, #8      )
+CPU_LE(        ext             v5.16b, v5.16b, v5.16b, #8      )
+CPU_LE(        ext             v6.16b, v6.16b, v6.16b, #8      )
+CPU_LE(        ext             v7.16b, v7.16b, v7.16b, #8      )
+
+       // XOR the first 16 data *bits* with the initial CRC value.
+       movi            v8.16b, #0
+       mov             v8.h[7], init_crc
+       eor             v0.16b, v0.16b, v8.16b
+
+       // Load the constants for folding across 128 bytes.
+       ld1             {fold_consts.2d}, [fold_consts_ptr]
+
+       // Subtract 128 for the 128 data bytes just consumed.  Subtract another
+       // 128 to simplify the termination condition of the following loop.
+       sub             len, len, #256
+
+       // While >= 128 data bytes remain (not counting v0-v7), fold the 128
+       // bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+       fold_32_bytes   \p, v0, v1
+       fold_32_bytes   \p, v2, v3
+       fold_32_bytes   \p, v4, v5
+       fold_32_bytes   \p, v6, v7
+
+       subs            len, len, #128
+       b.ge            .Lfold_128_bytes_loop_\@
+
+       // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+       // Fold across 64 bytes.
+       add             fold_consts_ptr, fold_consts_ptr, #16
+       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
+       fold_16_bytes   \p, v0, v4
+       fold_16_bytes   \p, v1, v5
+       fold_16_bytes   \p, v2, v6
+       fold_16_bytes   \p, v3, v7, 1
+       // Fold across 32 bytes.
+       fold_16_bytes   \p, v4, v6
+       fold_16_bytes   \p, v5, v7, 1
+       // Fold across 16 bytes.
+       fold_16_bytes   \p, v6, v7
+
+       // Add 128 to get the correct number of data bytes remaining in 0...127
+       // (not counting v7), following the previous extra subtraction by 128.
+       // Then subtract 16 to simplify the termination condition of the
+       // following loop.
+       adds            len, len, #(128-16)
+
+       // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+       // into them, storing the result back into v7.
+       b.lt            .Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+       pmull16x64_\p   fold_consts, v7, v8
+       eor             v7.16b, v7.16b, v8.16b
+       ldr             q0, [buf], #16
+CPU_LE(        rev64           v0.16b, v0.16b                  )
+CPU_LE(        ext             v0.16b, v0.16b, v0.16b, #8      )
+       eor             v7.16b, v7.16b, v0.16b
+       subs            len, len, #16
+       b.ge            .Lfold_16_bytes_loop_\@
+
+.Lfold_16_bytes_loop_done_\@:
+       // Add 16 to get the correct number of data bytes remaining in 0...15
+       // (not counting v7), following the previous extra subtraction by 16.
+       adds            len, len, #16
+       b.eq            .Lreduce_final_16_bytes_\@
+
+.Lhandle_partial_segment_\@:
+       // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+       // 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
+       // do this without needing a fold constant for each possible 'len',
+       // redivide the bytes into a first chunk of 'len' bytes and a second
+       // chunk of 16 bytes, then fold the first chunk into the second.
+
+       // v0 = last 16 original data bytes
+       add             buf, buf, len
+       ldr             q0, [buf, #-16]
+CPU_LE(        rev64           v0.16b, v0.16b                  )
+CPU_LE(        ext             v0.16b, v0.16b, v0.16b, #8      )
+
+       // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+       adr_l           x4, .Lbyteshift_table + 16
+       sub             x4, x4, len
+       ld1             {v2.16b}, [x4]
+       tbl             v1.16b, {v7.16b}, v2.16b
+
+       // v3 = first chunk: v7 right-shifted by '16-len' bytes.
+       movi            v3.16b, #0x80
+       eor             v2.16b, v2.16b, v3.16b
+       tbl             v3.16b, {v7.16b}, v2.16b
+
+       // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+       sshr            v2.16b, v2.16b, #7
+
+       // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+       // then '16-len' bytes from v1 (high-order bytes).
+       bsl             v2.16b, v1.16b, v0.16b
+
+       // Fold the first chunk into the second chunk, storing the result in v7.
+       pmull16x64_\p   fold_consts, v3, v0
+       eor             v7.16b, v3.16b, v0.16b
+       eor             v7.16b, v7.16b, v2.16b
+       b               .Lreduce_final_16_bytes_\@
+
+.Lless_than_256_bytes_\@:
+       // Checksumming a buffer of length 16...255 bytes
+
+       adr_l           fold_consts_ptr, .Lfold_across_16_bytes_consts
+
+       // Load the first 16 data bytes.
+       ldr             q7, [buf], #0x10
+CPU_LE(        rev64           v7.16b, v7.16b                  )
+CPU_LE(        ext             v7.16b, v7.16b, v7.16b, #8      )
+
+       // XOR the first 16 data *bits* with the initial CRC value.
+       movi            v0.16b, #0
+       mov             v0.h[7], init_crc
+       eor             v7.16b, v7.16b, v0.16b
+
+       // Load the fold-across-16-bytes constants.
+       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
+
+       cmp             len, #16
+       b.eq            .Lreduce_final_16_bytes_\@      // len == 16
+       subs            len, len, #32
+       b.ge            .Lfold_16_bytes_loop_\@         // 32 <= len <= 255
+       add             len, len, #16
+       b               .Lhandle_partial_segment_\@     // 17 <= len <= 31
+
+.Lreduce_final_16_bytes_\@:
+       .endm
+
+//
+// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p8)
+       frame_push      1
+
+       // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+       movi            perm.4h, #8, lsl #8
+       orr             perm.2s, #1, lsl #16
+       orr             perm.2s, #1, lsl #24
+       zip1            perm.16b, perm.16b, perm.16b
+       zip1            perm.16b, perm.16b, perm.16b
+
+       crc_t10dif_pmull p8
+
+CPU_LE(        rev64           v7.16b, v7.16b                  )
+CPU_LE(        ext             v7.16b, v7.16b, v7.16b, #8      )
+       str             q7, [x3]
+
+       frame_pop
+       ret
+SYM_FUNC_END(crc_t10dif_pmull_p8)
+
+       .align          5
+//
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p64)
+       crc_t10dif_pmull        p64
+
+       // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+       movi            v2.16b, #0              // init zero register
+
+       // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+       ld1             {fold_consts.2d}, [fold_consts_ptr], #16
+
+       // Fold the high 64 bits into the low 64 bits, while also multiplying by
+       // x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
+       // whose low 48 bits are 0.
+       ext             v0.16b, v2.16b, v7.16b, #8
+       pmull2          v7.1q, v7.2d, fold_consts.2d    // high bits * x^48 * (x^80 mod G(x))
+       eor             v0.16b, v0.16b, v7.16b          // + low bits * x^64
+
+       // Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
+       // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+       ext             v1.16b, v0.16b, v2.16b, #12     // extract high 32 bits
+       mov             v0.s[3], v2.s[0]                // zero high 32 bits
+       pmull           v1.1q, v1.1d, fold_consts.1d    // high 32 bits * x^48 * (x^48 mod G(x))
+       eor             v0.16b, v0.16b, v1.16b          // + low bits
+
+       // Load G(x) and floor(x^48 / G(x)).
+       ld1             {fold_consts.2d}, [fold_consts_ptr]
+
+       // Use Barrett reduction to compute the final CRC value.
+       pmull2          v1.1q, v0.2d, fold_consts.2d    // high 32 bits * floor(x^48 / G(x))
+       ushr            v1.2d, v1.2d, #32               // /= x^32
+       pmull           v1.1q, v1.1d, fold_consts.1d    // *= G(x)
+       ushr            v0.2d, v0.2d, #48
+       eor             v0.16b, v0.16b, v1.16b          // + low 16 nonzero bits
+       // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+       umov            w0, v0.h[0]
+       ret
+SYM_FUNC_END(crc_t10dif_pmull_p64)
+
+       .section        ".rodata", "a"
+       .align          4
+
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+       .quad           0x0000000000006123      // x^(8*128)    mod G(x)
+       .quad           0x0000000000002295      // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+       .quad           0x0000000000001069      // x^(4*128)    mod G(x)
+       .quad           0x000000000000dd31      // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+       .quad           0x000000000000857d      // x^(2*128)    mod G(x)
+       .quad           0x0000000000007acc      // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+       .quad           0x000000000000a010      // x^(1*128)    mod G(x)
+       .quad           0x0000000000001faa      // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+       .quad           0x1368000000000000      // x^48 * (x^48 mod G(x))
+       .quad           0x2d56000000000000      // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+       .quad           0x0000000000018bb7      // G(x)
+       .quad           0x00000001f65a57f8      // floor(x^48 / G(x))
+
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
+       .byte            0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+       .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+       .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
+       .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0
diff --git a/arch/arm64/lib/crc-t10dif-glue.c b/arch/arm64/lib/crc-t10dif-glue.c
new file mode 100644 (file)
index 0000000..dab7e37
--- /dev/null
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/crc-t10dif.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/simd.h>
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static DEFINE_STATIC_KEY_FALSE(have_asimd);
+static DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE    16U
+
+asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
+                                   u8 out[16]);
+asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+
+u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
+{
+       if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
+               if (static_branch_likely(&have_pmull)) {
+                       if (crypto_simd_usable()) {
+                               kernel_neon_begin();
+                               crc = crc_t10dif_pmull_p64(crc, data, length);
+                               kernel_neon_end();
+                               return crc;
+                       }
+               } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
+                          static_branch_likely(&have_asimd) &&
+                          crypto_simd_usable()) {
+                       u8 buf[16];
+
+                       kernel_neon_begin();
+                       crc_t10dif_pmull_p8(crc, data, length, buf);
+                       kernel_neon_end();
+
+                       crc = 0;
+                       data = buf;
+                       length = sizeof(buf);
+               }
+       }
+       return crc_t10dif_generic(crc, data, length);
+}
+EXPORT_SYMBOL(crc_t10dif_arch);
+
+static int __init crc_t10dif_arm64_init(void)
+{
+       if (cpu_have_named_feature(ASIMD)) {
+               static_branch_enable(&have_asimd);
+               if (cpu_have_named_feature(PMULL))
+                       static_branch_enable(&have_pmull);
+       }
+       return 0;
+}
+arch_initcall(crc_t10dif_arm64_init);
+
+static void __exit crc_t10dif_arm64_exit(void)
+{
+}
+module_exit(crc_t10dif_arm64_exit);
+
+bool crc_t10dif_is_optimized(void)
+{
+       return static_key_enabled(&have_asimd);
+}
+EXPORT_SYMBOL(crc_t10dif_is_optimized);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions");
+MODULE_LICENSE("GPL v2");
index 859345379044fc287458644309d66cf5f3d8bdf5..348e8bef62c7a0fc8225ccec6d881c7db0231626 100644 (file)
@@ -46,8 +46,7 @@ static void handle_kick_signal(int sig, siginfo_t *info, void *context)
 }
 
 static char *drivers[] = {
-       "crct10dif-arm64-ce",
-       /* "crct10dif-arm64-neon", - Same priority as generic */
+       "crct10dif-arm64",
        "sha1-ce",
        "sha224-arm64",
        "sha224-arm64-neon",