lib/crypto: powerpc/sha1: Migrate optimized code into library
Author: Eric Biggers <ebiggers@kernel.org>
Sat, 12 Jul 2025 23:23:01 +0000 (16:23 -0700)
Committer: Eric Biggers <ebiggers@kernel.org>
Mon, 14 Jul 2025 18:11:49 +0000 (11:11 -0700)
Instead of exposing the powerpc-optimized SHA-1 code via
powerpc-specific crypto_shash algorithms, just implement the
sha1_blocks() library function.  This is much simpler, it makes the
SHA-1 library functions be powerpc-optimized, and it fixes the
longstanding issue where the powerpc-optimized SHA-1 code was disabled
by default.  SHA-1 still remains available through crypto_shash, but
individual architectures no longer need to handle it.

Note: to see the diff from arch/powerpc/crypto/sha1-spe-glue.c to
lib/crypto/powerpc/sha1.h, view this commit with 'git show -M10'.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250712232329.818226-11-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
14 files changed:
arch/powerpc/configs/44x/akebono_defconfig
arch/powerpc/configs/powernv_defconfig
arch/powerpc/configs/ppc64_defconfig
arch/powerpc/crypto/Kconfig
arch/powerpc/crypto/Makefile
arch/powerpc/crypto/sha1-powerpc-asm.S [deleted file]
arch/powerpc/crypto/sha1-spe-asm.S [deleted file]
arch/powerpc/crypto/sha1-spe-glue.c [deleted file]
arch/powerpc/crypto/sha1.c [deleted file]
lib/crypto/Kconfig
lib/crypto/Makefile
lib/crypto/powerpc/sha1-powerpc-asm.S [new file with mode: 0644]
lib/crypto/powerpc/sha1-spe-asm.S [new file with mode: 0644]
lib/crypto/powerpc/sha1.h [new file with mode: 0644]

index fde4824f235efba835d48c30af6825c4c0ea5e24..1882eb2da354a721a6f67f901a134d7fbfee89e9 100644 (file)
@@ -128,6 +128,5 @@ CONFIG_PPC_EARLY_DEBUG_44x_PHYSLOW=0x00010000
 CONFIG_PPC_EARLY_DEBUG_44x_PHYSHIGH=0x33f
 CONFIG_CRYPTO_PCBC=y
 CONFIG_CRYPTO_MD5=y
-CONFIG_CRYPTO_SHA1_PPC=y
 CONFIG_CRYPTO_DES=y
 # CONFIG_CRYPTO_HW is not set
index 379229c982a493e3fdfad942fd4cc9f3b3effa7c..98f56e63ad21cb7ba5c3ef33f6d6b1190506ef42 100644 (file)
@@ -322,7 +322,6 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_SHA1_PPC=m
 CONFIG_CRYPTO_SHA256=y
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
index 3423c405cad4b777a9d2ff543cb548a0defa3a29..dca67aae5da3c4278bb2bc2e54260648dcd02477 100644 (file)
@@ -388,7 +388,6 @@ CONFIG_CRYPTO_SHA256=y
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_LZO=m
 CONFIG_CRYPTO_MD5_PPC=m
-CONFIG_CRYPTO_SHA1_PPC=m
 CONFIG_CRYPTO_AES_GCM_P10=m
 CONFIG_CRYPTO_DEV_NX=y
 CONFIG_CRYPTO_DEV_NX_ENCRYPT=m
index caaa359f4742039f2cf52a10efb4aed8d325f0b7..cfe39fc221cf81fe54dec028ec445a973bf72929 100644 (file)
@@ -23,22 +23,6 @@ config CRYPTO_MD5_PPC
 
          Architecture: powerpc
 
-config CRYPTO_SHA1_PPC
-       tristate "Hash functions: SHA-1"
-       help
-         SHA-1 secure hash algorithm (FIPS 180)
-
-         Architecture: powerpc
-
-config CRYPTO_SHA1_PPC_SPE
-       tristate "Hash functions: SHA-1 (SPE)"
-       depends on SPE
-       help
-         SHA-1 secure hash algorithm (FIPS 180)
-
-         Architecture: powerpc using
-         - SPE (Signal Processing Engine) extensions
-
 config CRYPTO_AES_PPC_SPE
        tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)"
        depends on SPE
index 8c2936ae466fcb8788a885a4594ad46238d064a5..bc8fd27344b8bbe0a1992416c3b850c98fa26bdb 100644 (file)
@@ -7,16 +7,12 @@
 
 obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o
 obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o
-obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o
-obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o
 obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
 obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
 obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o
 
 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
-sha1-powerpc-y := sha1-powerpc-asm.o sha1.o
-sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
 aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o
 vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
 curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o
diff --git a/arch/powerpc/crypto/sha1-powerpc-asm.S b/arch/powerpc/crypto/sha1-powerpc-asm.S
deleted file mode 100644 (file)
index f0d5ed5..0000000
+++ /dev/null
@@ -1,188 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * SHA-1 implementation for PowerPC.
- *
- * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
- */
-
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/asm-compat.h>
-
-#ifdef __BIG_ENDIAN__
-#define LWZ(rt, d, ra) \
-       lwz     rt,d(ra)
-#else
-#define LWZ(rt, d, ra) \
-       li      rt,d;   \
-       lwbrx   rt,rt,ra
-#endif
-
-/*
- * We roll the registers for T, A, B, C, D, E around on each
- * iteration; T on iteration t is A on iteration t+1, and so on.
- * We use registers 7 - 12 for this.
- */
-#define RT(t)  ((((t)+5)%6)+7)
-#define RA(t)  ((((t)+4)%6)+7)
-#define RB(t)  ((((t)+3)%6)+7)
-#define RC(t)  ((((t)+2)%6)+7)
-#define RD(t)  ((((t)+1)%6)+7)
-#define RE(t)  ((((t)+0)%6)+7)
-
-/* We use registers 16 - 31 for the W values */
-#define W(t)   (((t)%16)+16)
-
-#define LOADW(t)                               \
-       LWZ(W(t),(t)*4,r4)
-
-#define STEPD0_LOAD(t)                         \
-       andc    r0,RD(t),RB(t);         \
-       and     r6,RB(t),RC(t);         \
-       rotlwi  RT(t),RA(t),5;                  \
-       or      r6,r6,r0;                       \
-       add     r0,RE(t),r15;                   \
-       add     RT(t),RT(t),r6;         \
-       add     r14,r0,W(t);                    \
-       LWZ(W((t)+4),((t)+4)*4,r4);     \
-       rotlwi  RB(t),RB(t),30;                 \
-       add     RT(t),RT(t),r14
-
-#define STEPD0_UPDATE(t)                       \
-       and     r6,RB(t),RC(t);         \
-       andc    r0,RD(t),RB(t);         \
-       rotlwi  RT(t),RA(t),5;                  \
-       rotlwi  RB(t),RB(t),30;                 \
-       or      r6,r6,r0;                       \
-       add     r0,RE(t),r15;                   \
-       xor     r5,W((t)+4-3),W((t)+4-8);               \
-       add     RT(t),RT(t),r6;         \
-       xor     W((t)+4),W((t)+4-16),W((t)+4-14);       \
-       add     r0,r0,W(t);                     \
-       xor     W((t)+4),W((t)+4),r5;                   \
-       add     RT(t),RT(t),r0;         \
-       rotlwi  W((t)+4),W((t)+4),1
-
-#define STEPD1(t)                              \
-       xor     r6,RB(t),RC(t);         \
-       rotlwi  RT(t),RA(t),5;                  \
-       rotlwi  RB(t),RB(t),30;                 \
-       xor     r6,r6,RD(t);                    \
-       add     r0,RE(t),r15;                   \
-       add     RT(t),RT(t),r6;         \
-       add     r0,r0,W(t);                     \
-       add     RT(t),RT(t),r0
-
-#define STEPD1_UPDATE(t)                               \
-       xor     r6,RB(t),RC(t);         \
-       rotlwi  RT(t),RA(t),5;                  \
-       rotlwi  RB(t),RB(t),30;                 \
-       xor     r6,r6,RD(t);                    \
-       add     r0,RE(t),r15;                   \
-       xor     r5,W((t)+4-3),W((t)+4-8);               \
-       add     RT(t),RT(t),r6;         \
-       xor     W((t)+4),W((t)+4-16),W((t)+4-14);       \
-       add     r0,r0,W(t);                     \
-       xor     W((t)+4),W((t)+4),r5;                   \
-       add     RT(t),RT(t),r0;         \
-       rotlwi  W((t)+4),W((t)+4),1
-
-#define STEPD2_UPDATE(t)                       \
-       and     r6,RB(t),RC(t);         \
-       and     r0,RB(t),RD(t);         \
-       rotlwi  RT(t),RA(t),5;                  \
-       or      r6,r6,r0;                       \
-       rotlwi  RB(t),RB(t),30;                 \
-       and     r0,RC(t),RD(t);         \
-       xor     r5,W((t)+4-3),W((t)+4-8);       \
-       or      r6,r6,r0;                       \
-       xor     W((t)+4),W((t)+4-16),W((t)+4-14);       \
-       add     r0,RE(t),r15;                   \
-       add     RT(t),RT(t),r6;         \
-       add     r0,r0,W(t);                     \
-       xor     W((t)+4),W((t)+4),r5;           \
-       add     RT(t),RT(t),r0;         \
-       rotlwi  W((t)+4),W((t)+4),1
-
-#define STEP0LD4(t)                            \
-       STEPD0_LOAD(t);                         \
-       STEPD0_LOAD((t)+1);                     \
-       STEPD0_LOAD((t)+2);                     \
-       STEPD0_LOAD((t)+3)
-
-#define STEPUP4(t, fn)                         \
-       STEP##fn##_UPDATE(t);                   \
-       STEP##fn##_UPDATE((t)+1);               \
-       STEP##fn##_UPDATE((t)+2);               \
-       STEP##fn##_UPDATE((t)+3)
-
-#define STEPUP20(t, fn)                                \
-       STEPUP4(t, fn);                         \
-       STEPUP4((t)+4, fn);                     \
-       STEPUP4((t)+8, fn);                     \
-       STEPUP4((t)+12, fn);                    \
-       STEPUP4((t)+16, fn)
-
-_GLOBAL(powerpc_sha_transform)
-       PPC_STLU r1,-INT_FRAME_SIZE(r1)
-       SAVE_GPRS(14, 31, r1)
-
-       /* Load up A - E */
-       lwz     RA(0),0(r3)     /* A */
-       lwz     RB(0),4(r3)     /* B */
-       lwz     RC(0),8(r3)     /* C */
-       lwz     RD(0),12(r3)    /* D */
-       lwz     RE(0),16(r3)    /* E */
-
-       LOADW(0)
-       LOADW(1)
-       LOADW(2)
-       LOADW(3)
-
-       lis     r15,0x5a82      /* K0-19 */
-       ori     r15,r15,0x7999
-       STEP0LD4(0)
-       STEP0LD4(4)
-       STEP0LD4(8)
-       STEPUP4(12, D0)
-       STEPUP4(16, D0)
-
-       lis     r15,0x6ed9      /* K20-39 */
-       ori     r15,r15,0xeba1
-       STEPUP20(20, D1)
-
-       lis     r15,0x8f1b      /* K40-59 */
-       ori     r15,r15,0xbcdc
-       STEPUP20(40, D2)
-
-       lis     r15,0xca62      /* K60-79 */
-       ori     r15,r15,0xc1d6
-       STEPUP4(60, D1)
-       STEPUP4(64, D1)
-       STEPUP4(68, D1)
-       STEPUP4(72, D1)
-       lwz     r20,16(r3)
-       STEPD1(76)
-       lwz     r19,12(r3)
-       STEPD1(77)
-       lwz     r18,8(r3)
-       STEPD1(78)
-       lwz     r17,4(r3)
-       STEPD1(79)
-
-       lwz     r16,0(r3)
-       add     r20,RE(80),r20
-       add     RD(0),RD(80),r19
-       add     RC(0),RC(80),r18
-       add     RB(0),RB(80),r17
-       add     RA(0),RA(80),r16
-       mr      RE(0),r20
-       stw     RA(0),0(r3)
-       stw     RB(0),4(r3)
-       stw     RC(0),8(r3)
-       stw     RD(0),12(r3)
-       stw     RE(0),16(r3)
-
-       REST_GPRS(14, 31, r1)
-       addi    r1,r1,INT_FRAME_SIZE
-       blr
diff --git a/arch/powerpc/crypto/sha1-spe-asm.S b/arch/powerpc/crypto/sha1-spe-asm.S
deleted file mode 100644 (file)
index 0f44752..0000000
+++ /dev/null
@@ -1,294 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Fast SHA-1 implementation for SPE instruction set (PPC)
- *
- * This code makes use of the SPE SIMD instruction set as defined in
- * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
- * Implementation is based on optimization guide notes from
- * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- */
-
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-
-#define rHP    r3      /* pointer to hash value                        */
-#define rWP    r4      /* pointer to input                             */
-#define rKP    r5      /* pointer to constants                         */
-
-#define rW0    r14     /* 64 bit round words                           */
-#define rW1    r15
-#define rW2    r16
-#define rW3    r17
-#define rW4    r18
-#define rW5    r19
-#define rW6    r20
-#define rW7    r21
-
-#define rH0    r6      /* 32 bit hash values                           */
-#define rH1    r7
-#define rH2    r8
-#define rH3    r9
-#define rH4    r10
-
-#define rT0    r22     /* 64 bit temporary                             */
-#define rT1    r0      /* 32 bit temporaries                           */
-#define rT2    r11
-#define rT3    r12
-
-#define rK     r23     /* 64 bit constant in volatile register         */
-
-#define LOAD_K01
-
-#define LOAD_K11 \
-       evlwwsplat      rK,0(rKP);
-
-#define LOAD_K21 \
-       evlwwsplat      rK,4(rKP);
-
-#define LOAD_K31 \
-       evlwwsplat      rK,8(rKP);
-
-#define LOAD_K41 \
-       evlwwsplat      rK,12(rKP);
-
-#define INITIALIZE \
-       stwu            r1,-128(r1);    /* create stack frame           */ \
-       evstdw          r14,8(r1);      /* We must save non volatile    */ \
-       evstdw          r15,16(r1);     /* registers. Take the chance   */ \
-       evstdw          r16,24(r1);     /* and save the SPE part too    */ \
-       evstdw          r17,32(r1);                                        \
-       evstdw          r18,40(r1);                                        \
-       evstdw          r19,48(r1);                                        \
-       evstdw          r20,56(r1);                                        \
-       evstdw          r21,64(r1);                                        \
-       evstdw          r22,72(r1);                                        \
-       evstdw          r23,80(r1);
-
-
-#define FINALIZE \
-       evldw           r14,8(r1);      /* restore SPE registers        */ \
-       evldw           r15,16(r1);                                        \
-       evldw           r16,24(r1);                                        \
-       evldw           r17,32(r1);                                        \
-       evldw           r18,40(r1);                                        \
-       evldw           r19,48(r1);                                        \
-       evldw           r20,56(r1);                                        \
-       evldw           r21,64(r1);                                        \
-       evldw           r22,72(r1);                                        \
-       evldw           r23,80(r1);                                        \
-       xor             r0,r0,r0;                                          \
-       stw             r0,8(r1);       /* Delete sensitive data        */ \
-       stw             r0,16(r1);      /* that we might have pushed    */ \
-       stw             r0,24(r1);      /* from other context that runs */ \
-       stw             r0,32(r1);      /* the same code. Assume that   */ \
-       stw             r0,40(r1);      /* the lower part of the GPRs   */ \
-       stw             r0,48(r1);      /* were already overwritten on  */ \
-       stw             r0,56(r1);      /* the way down to here         */ \
-       stw             r0,64(r1);                                         \
-       stw             r0,72(r1);                                         \
-       stw             r0,80(r1);                                         \
-       addi            r1,r1,128;      /* cleanup stack frame          */
-
-#ifdef __BIG_ENDIAN__
-#define LOAD_DATA(reg, off) \
-       lwz             reg,off(rWP);   /* load data                    */
-#define NEXT_BLOCK \
-       addi            rWP,rWP,64;     /* increment per block          */
-#else
-#define LOAD_DATA(reg, off) \
-       lwbrx           reg,0,rWP;      /* load data                    */ \
-       addi            rWP,rWP,4;      /* increment per word           */
-#define NEXT_BLOCK                     /* nothing to do                */
-#endif
-
-#define        R_00_15(a, b, c, d, e, w0, w1, k, off) \
-       LOAD_DATA(w0, off)              /* 1: W                         */ \
-       and             rT2,b,c;        /* 1: F' = B and C              */ \
-       LOAD_K##k##1                                                       \
-       andc            rT1,d,b;        /* 1: F" = ~B and D             */ \
-       rotrwi          rT0,a,27;       /* 1: A' = A rotl 5             */ \
-       or              rT2,rT2,rT1;    /* 1: F = F' or F"              */ \
-       add             e,e,rT0;        /* 1: E = E + A'                */ \
-       rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
-       add             e,e,w0;         /* 1: E = E + W                 */ \
-       LOAD_DATA(w1, off+4)            /* 2: W                         */ \
-       add             e,e,rT2;        /* 1: E = E + F                 */ \
-       and             rT1,a,b;        /* 2: F' = B and C              */ \
-       add             e,e,rK;         /* 1: E = E + K                 */ \
-       andc            rT2,c,a;        /* 2: F" = ~B and D             */ \
-       add             d,d,rK;         /* 2: E = E + K                 */ \
-       or              rT2,rT2,rT1;    /* 2: F = F' or F"              */ \
-       rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
-       add             d,d,w1;         /* 2: E = E + W                 */ \
-       rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
-       add             d,d,rT0;        /* 2: E = E + A'                */ \
-       evmergelo       w1,w1,w0;       /*    mix W[0]/W[1]             */ \
-       add             d,d,rT2         /* 2: E = E + F                 */
-
-#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
-       and             rT2,b,c;        /* 1: F' = B and C              */ \
-       evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
-       andc            rT1,d,b;        /* 1: F" = ~B and D             */ \
-       evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
-       or              rT1,rT1,rT2;    /* 1: F = F' or F"              */ \
-       evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
-       add             e,e,rT1;        /* 1: E = E + F                 */ \
-       evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
-       rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
-       evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
-       add             e,e,rT2;        /* 1: E = E + A'                */ \
-       evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
-       rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
-       LOAD_K##k##1                                                       \
-       evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
-       add             e,e,rT0;        /* 1: E = E + WK                */ \
-       add             d,d,rT1;        /* 2: E = E + WK                */ \
-       and             rT2,a,b;        /* 2: F' = B and C              */ \
-       andc            rT1,c,a;        /* 2: F" = ~B and D             */ \
-       rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
-       or              rT1,rT1,rT2;    /* 2: F = F' or F"              */ \
-       add             d,d,rT0;        /* 2: E = E + A'                */ \
-       rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
-       add             d,d,rT1         /* 2: E = E + F                 */
-
-#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
-       evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
-       xor             rT2,b,c;        /* 1: F' = B xor C              */ \
-       evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
-       xor             rT2,rT2,d;      /* 1: F = F' xor D              */ \
-       evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
-       add             e,e,rT2;        /* 1: E = E + F                 */ \
-       evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
-       rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
-       evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
-       add             e,e,rT2;        /* 1: E = E + A'                */ \
-       evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
-       rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
-       LOAD_K##k##1                                                       \
-       evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
-       add             e,e,rT0;        /* 1: E = E + WK                */ \
-       xor             rT2,a,b;        /* 2: F' = B xor C              */ \
-       add             d,d,rT1;        /* 2: E = E + WK                */ \
-       xor             rT2,rT2,c;      /* 2: F = F' xor D              */ \
-       rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
-       add             d,d,rT2;        /* 2: E = E + F                 */ \
-       rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
-       add             d,d,rT0         /* 2: E = E + A'                */
-
-#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
-       and             rT2,b,c;        /* 1: F' = B and C              */ \
-       evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
-       or              rT1,b,c;        /* 1: F" = B or C               */ \
-       evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
-       and             rT1,d,rT1;      /* 1: F" = F" and D             */ \
-       evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
-       or              rT2,rT2,rT1;    /* 1: F = F' or F"              */ \
-       evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
-       add             e,e,rT2;        /* 1: E = E + F                 */ \
-       evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
-       rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
-       evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
-       add             e,e,rT2;        /* 1: E = E + A'                */ \
-       LOAD_K##k##1                                                       \
-       evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
-       rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
-       add             e,e,rT0;        /* 1: E = E + WK                */ \
-       and             rT2,a,b;        /* 2: F' = B and C              */ \
-       or              rT0,a,b;        /* 2: F" = B or C               */ \
-       add             d,d,rT1;        /* 2: E = E + WK                */ \
-       and             rT0,c,rT0;      /* 2: F" = F" and D             */ \
-       rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
-       or              rT2,rT2,rT0;    /* 2: F = F' or F"              */ \
-       rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
-       add             d,d,rT2;        /* 2: E = E + F                 */ \
-       add             d,d,rT0         /* 2: E = E + A'                */
-
-#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
-       R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
-
-_GLOBAL(ppc_spe_sha1_transform)
-       INITIALIZE
-
-       lwz             rH0,0(rHP)
-       lwz             rH1,4(rHP)
-       mtctr           r5
-       lwz             rH2,8(rHP)
-       lis             rKP,PPC_SPE_SHA1_K@h
-       lwz             rH3,12(rHP)
-       ori             rKP,rKP,PPC_SPE_SHA1_K@l
-       lwz             rH4,16(rHP)
-
-ppc_spe_sha1_main:
-       R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
-       R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
-       R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
-       R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
-       R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
-       R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
-       R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
-       R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)
-
-       R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
-       R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)
-
-       R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
-       R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
-       R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
-       R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
-       R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
-       R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
-       R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
-       R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
-       R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
-       R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)
-
-       R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
-       R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
-       R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
-       R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
-       R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
-       R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
-       R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
-       R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
-       R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
-       R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)
-
-       R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
-       R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
-       R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
-       R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
-       R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
-       R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
-       R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
-       lwz             rT3,0(rHP)
-       R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
-       lwz             rW1,4(rHP)
-       R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
-       lwz             rW2,8(rHP)
-       R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
-       lwz             rW3,12(rHP)
-       NEXT_BLOCK
-       lwz             rW4,16(rHP)
-
-       add             rH0,rH0,rT3
-       stw             rH0,0(rHP)
-       add             rH1,rH1,rW1
-       stw             rH1,4(rHP)
-       add             rH2,rH2,rW2
-       stw             rH2,8(rHP)
-       add             rH3,rH3,rW3
-       stw             rH3,12(rHP)
-       add             rH4,rH4,rW4
-       stw             rH4,16(rHP)
-
-       bdnz            ppc_spe_sha1_main
-
-       FINALIZE
-       blr
-
-.data
-.align 4
-PPC_SPE_SHA1_K:
-       .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c
deleted file mode 100644 (file)
index 04c88e1..0000000
+++ /dev/null
@@ -1,107 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for SHA-1 implementation for SPE instructions (PPC)
- *
- * Based on generic implementation.
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- */
-
-#include <asm/switch_to.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/kernel.h>
-#include <linux/preempt.h>
-#include <linux/module.h>
-
-/*
- * MAX_BYTES defines the number of bytes that are allowed to be processed
- * between preempt_disable() and preempt_enable(). SHA1 takes ~1000
- * operations per 64 bytes. e500 cores can issue two arithmetic instructions
- * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
- * Thus 2KB of input data will need an estimated maximum of 18,000 cycles.
- * Headroom for cache misses included. Even with the low end model clocked
- * at 667 MHz this equals to a critical time window of less than 27us.
- *
- */
-#define MAX_BYTES 2048
-
-asmlinkage void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks);
-
-static void spe_begin(void)
-{
-       /* We just start SPE operations and will save SPE registers later. */
-       preempt_disable();
-       enable_kernel_spe();
-}
-
-static void spe_end(void)
-{
-       disable_kernel_spe();
-       /* reenable preemption */
-       preempt_enable();
-}
-
-static void ppc_spe_sha1_block(struct sha1_state *sctx, const u8 *src,
-                              int blocks)
-{
-       do {
-               int unit = min(blocks, MAX_BYTES / SHA1_BLOCK_SIZE);
-
-               spe_begin();
-               ppc_spe_sha1_transform(sctx->state, src, unit);
-               spe_end();
-
-               src += unit * SHA1_BLOCK_SIZE;
-               blocks -= unit;
-       } while (blocks);
-}
-
-static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data,
-                       unsigned int len)
-{
-       return sha1_base_do_update_blocks(desc, data, len, ppc_spe_sha1_block);
-}
-
-static int ppc_spe_sha1_finup(struct shash_desc *desc, const u8 *src,
-                             unsigned int len, u8 *out)
-{
-       sha1_base_do_finup(desc, src, len, ppc_spe_sha1_block);
-       return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
-       .digestsize     =       SHA1_DIGEST_SIZE,
-       .init           =       sha1_base_init,
-       .update         =       ppc_spe_sha1_update,
-       .finup          =       ppc_spe_sha1_finup,
-       .descsize       =       SHA1_STATE_SIZE,
-       .base           =       {
-               .cra_name       =       "sha1",
-               .cra_driver_name=       "sha1-ppc-spe",
-               .cra_priority   =       300,
-               .cra_flags      =       CRYPTO_AHASH_ALG_BLOCK_ONLY,
-               .cra_blocksize  =       SHA1_BLOCK_SIZE,
-               .cra_module     =       THIS_MODULE,
-       }
-};
-
-static int __init ppc_spe_sha1_mod_init(void)
-{
-       return crypto_register_shash(&alg);
-}
-
-static void __exit ppc_spe_sha1_mod_fini(void)
-{
-       crypto_unregister_shash(&alg);
-}
-
-module_init(ppc_spe_sha1_mod_init);
-module_exit(ppc_spe_sha1_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, SPE optimized");
-
-MODULE_ALIAS_CRYPTO("sha1");
-MODULE_ALIAS_CRYPTO("sha1-ppc-spe");
diff --git a/arch/powerpc/crypto/sha1.c b/arch/powerpc/crypto/sha1.c
deleted file mode 100644 (file)
index 4593946..0000000
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API.
- *
- * powerpc implementation of the SHA1 Secure Hash Algorithm.
- *
- * Derived from cryptoapi implementation, adapted for in-place
- * scatterlist interface.
- *
- * Derived from "crypto/sha1.c"
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- */
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void powerpc_sha_transform(u32 *state, const u8 *src);
-
-static void powerpc_sha_block(struct sha1_state *sctx, const u8 *data,
-                             int blocks)
-{
-       do {
-               powerpc_sha_transform(sctx->state, data);
-               data += 64;
-       } while (--blocks);
-}
-
-static int powerpc_sha1_update(struct shash_desc *desc, const u8 *data,
-                              unsigned int len)
-{
-       return sha1_base_do_update_blocks(desc, data, len, powerpc_sha_block);
-}
-
-/* Add padding and return the message digest. */
-static int powerpc_sha1_finup(struct shash_desc *desc, const u8 *src,
-                             unsigned int len, u8 *out)
-{
-       sha1_base_do_finup(desc, src, len, powerpc_sha_block);
-       return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
-       .digestsize     =       SHA1_DIGEST_SIZE,
-       .init           =       sha1_base_init,
-       .update         =       powerpc_sha1_update,
-       .finup          =       powerpc_sha1_finup,
-       .descsize       =       SHA1_STATE_SIZE,
-       .base           =       {
-               .cra_name       =       "sha1",
-               .cra_driver_name=       "sha1-powerpc",
-               .cra_flags      =       CRYPTO_AHASH_ALG_BLOCK_ONLY,
-               .cra_blocksize  =       SHA1_BLOCK_SIZE,
-               .cra_module     =       THIS_MODULE,
-       }
-};
-
-static int __init sha1_powerpc_mod_init(void)
-{
-       return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_powerpc_mod_fini(void)
-{
-       crypto_unregister_shash(&alg);
-}
-
-module_init(sha1_powerpc_mod_init);
-module_exit(sha1_powerpc_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm");
-
-MODULE_ALIAS_CRYPTO("sha1");
-MODULE_ALIAS_CRYPTO("sha1-powerpc");
index 7c9dc432fb4216fc4a110b324b8068791eed95d8..22ffbdab82d6ecd0be66434be88dbe6a2f9d933b 100644 (file)
@@ -149,6 +149,7 @@ config CRYPTO_LIB_SHA1_ARCH
        default y if ARM
        default y if ARM64 && KERNEL_MODE_NEON
        default y if MIPS && CPU_CAVIUM_OCTEON
+       default y if PPC
 
 config CRYPTO_LIB_SHA256
        tristate
index 1da13c9e2f711c4af74b3904767da2706860822d..02f672562928e1993fb2050fa4f02f69a3bdd892 100644 (file)
@@ -77,6 +77,10 @@ libsha1-$(CONFIG_KERNEL_MODE_NEON) += arm/sha1-armv7-neon.o \
                                      arm/sha1-ce-core.o
 endif
 libsha1-$(CONFIG_ARM64) += arm64/sha1-ce-core.o
+ifeq ($(CONFIG_PPC),y)
+libsha1-y += powerpc/sha1-powerpc-asm.o
+libsha1-$(CONFIG_SPE) += powerpc/sha1-spe-asm.o
+endif
 endif # CONFIG_CRYPTO_LIB_SHA1_ARCH
 
 ################################################################################
diff --git a/lib/crypto/powerpc/sha1-powerpc-asm.S b/lib/crypto/powerpc/sha1-powerpc-asm.S
new file mode 100644 (file)
index 0000000..f0d5ed5
--- /dev/null
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SHA-1 implementation for PowerPC.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+
+#ifdef __BIG_ENDIAN__
+#define LWZ(rt, d, ra) \
+       lwz     rt,d(ra)
+#else
+#define LWZ(rt, d, ra) \
+       li      rt,d;   \
+       lwbrx   rt,rt,ra
+#endif
+
+/*
+ * We roll the registers for T, A, B, C, D, E around on each
+ * iteration; T on iteration t is A on iteration t+1, and so on.
+ * We use registers 7 - 12 for this.
+ */
+#define RT(t)  ((((t)+5)%6)+7)
+#define RA(t)  ((((t)+4)%6)+7)
+#define RB(t)  ((((t)+3)%6)+7)
+#define RC(t)  ((((t)+2)%6)+7)
+#define RD(t)  ((((t)+1)%6)+7)
+#define RE(t)  ((((t)+0)%6)+7)
+
+/* We use registers 16 - 31 for the W values */
+#define W(t)   (((t)%16)+16)
+
+#define LOADW(t)                               \
+       LWZ(W(t),(t)*4,r4)
+
+#define STEPD0_LOAD(t)                         \
+       andc    r0,RD(t),RB(t);         \
+       and     r6,RB(t),RC(t);         \
+       rotlwi  RT(t),RA(t),5;                  \
+       or      r6,r6,r0;                       \
+       add     r0,RE(t),r15;                   \
+       add     RT(t),RT(t),r6;         \
+       add     r14,r0,W(t);                    \
+       LWZ(W((t)+4),((t)+4)*4,r4);     \
+       rotlwi  RB(t),RB(t),30;                 \
+       add     RT(t),RT(t),r14
+
+#define STEPD0_UPDATE(t)                       \
+       and     r6,RB(t),RC(t);         \
+       andc    r0,RD(t),RB(t);         \
+       rotlwi  RT(t),RA(t),5;                  \
+       rotlwi  RB(t),RB(t),30;                 \
+       or      r6,r6,r0;                       \
+       add     r0,RE(t),r15;                   \
+       xor     r5,W((t)+4-3),W((t)+4-8);               \
+       add     RT(t),RT(t),r6;         \
+       xor     W((t)+4),W((t)+4-16),W((t)+4-14);       \
+       add     r0,r0,W(t);                     \
+       xor     W((t)+4),W((t)+4),r5;                   \
+       add     RT(t),RT(t),r0;         \
+       rotlwi  W((t)+4),W((t)+4),1
+
+#define STEPD1(t)                              \
+       xor     r6,RB(t),RC(t);         \
+       rotlwi  RT(t),RA(t),5;                  \
+       rotlwi  RB(t),RB(t),30;                 \
+       xor     r6,r6,RD(t);                    \
+       add     r0,RE(t),r15;                   \
+       add     RT(t),RT(t),r6;         \
+       add     r0,r0,W(t);                     \
+       add     RT(t),RT(t),r0
+
+#define STEPD1_UPDATE(t)                               \
+       xor     r6,RB(t),RC(t);         \
+       rotlwi  RT(t),RA(t),5;                  \
+       rotlwi  RB(t),RB(t),30;                 \
+       xor     r6,r6,RD(t);                    \
+       add     r0,RE(t),r15;                   \
+       xor     r5,W((t)+4-3),W((t)+4-8);               \
+       add     RT(t),RT(t),r6;         \
+       xor     W((t)+4),W((t)+4-16),W((t)+4-14);       \
+       add     r0,r0,W(t);                     \
+       xor     W((t)+4),W((t)+4),r5;                   \
+       add     RT(t),RT(t),r0;         \
+       rotlwi  W((t)+4),W((t)+4),1
+
+#define STEPD2_UPDATE(t)                       \
+       and     r6,RB(t),RC(t);         \
+       and     r0,RB(t),RD(t);         \
+       rotlwi  RT(t),RA(t),5;                  \
+       or      r6,r6,r0;                       \
+       rotlwi  RB(t),RB(t),30;                 \
+       and     r0,RC(t),RD(t);         \
+       xor     r5,W((t)+4-3),W((t)+4-8);       \
+       or      r6,r6,r0;                       \
+       xor     W((t)+4),W((t)+4-16),W((t)+4-14);       \
+       add     r0,RE(t),r15;                   \
+       add     RT(t),RT(t),r6;         \
+       add     r0,r0,W(t);                     \
+       xor     W((t)+4),W((t)+4),r5;           \
+       add     RT(t),RT(t),r0;         \
+       rotlwi  W((t)+4),W((t)+4),1
+
+#define STEP0LD4(t)                            \
+       STEPD0_LOAD(t);                         \
+       STEPD0_LOAD((t)+1);                     \
+       STEPD0_LOAD((t)+2);                     \
+       STEPD0_LOAD((t)+3)
+
+#define STEPUP4(t, fn)                         \
+       STEP##fn##_UPDATE(t);                   \
+       STEP##fn##_UPDATE((t)+1);               \
+       STEP##fn##_UPDATE((t)+2);               \
+       STEP##fn##_UPDATE((t)+3)
+
+#define STEPUP20(t, fn)                                \
+       STEPUP4(t, fn);                         \
+       STEPUP4((t)+4, fn);                     \
+       STEPUP4((t)+8, fn);                     \
+       STEPUP4((t)+12, fn);                    \
+       STEPUP4((t)+16, fn)
+
+_GLOBAL(powerpc_sha_transform)
+       PPC_STLU r1,-INT_FRAME_SIZE(r1)
+       SAVE_GPRS(14, 31, r1)
+
+       /* Load up A - E */
+       lwz     RA(0),0(r3)     /* A */
+       lwz     RB(0),4(r3)     /* B */
+       lwz     RC(0),8(r3)     /* C */
+       lwz     RD(0),12(r3)    /* D */
+       lwz     RE(0),16(r3)    /* E */
+
+       LOADW(0)
+       LOADW(1)
+       LOADW(2)
+       LOADW(3)
+
+       lis     r15,0x5a82      /* K0-19 */
+       ori     r15,r15,0x7999
+       STEP0LD4(0)
+       STEP0LD4(4)
+       STEP0LD4(8)
+       STEPUP4(12, D0)
+       STEPUP4(16, D0)
+
+       lis     r15,0x6ed9      /* K20-39 */
+       ori     r15,r15,0xeba1
+       STEPUP20(20, D1)
+
+       lis     r15,0x8f1b      /* K40-59 */
+       ori     r15,r15,0xbcdc
+       STEPUP20(40, D2)
+
+       lis     r15,0xca62      /* K60-79 */
+       ori     r15,r15,0xc1d6
+       STEPUP4(60, D1)
+       STEPUP4(64, D1)
+       STEPUP4(68, D1)
+       STEPUP4(72, D1)
+       lwz     r20,16(r3)
+       STEPD1(76)
+       lwz     r19,12(r3)
+       STEPD1(77)
+       lwz     r18,8(r3)
+       STEPD1(78)
+       lwz     r17,4(r3)
+       STEPD1(79)
+
+       lwz     r16,0(r3)
+       add     r20,RE(80),r20
+       add     RD(0),RD(80),r19
+       add     RC(0),RC(80),r18
+       add     RB(0),RB(80),r17
+       add     RA(0),RA(80),r16
+       mr      RE(0),r20
+       stw     RA(0),0(r3)
+       stw     RB(0),4(r3)
+       stw     RC(0),8(r3)
+       stw     RD(0),12(r3)
+       stw     RE(0),16(r3)
+
+       REST_GPRS(14, 31, r1)
+       addi    r1,r1,INT_FRAME_SIZE
+       blr
diff --git a/lib/crypto/powerpc/sha1-spe-asm.S b/lib/crypto/powerpc/sha1-spe-asm.S
new file mode 100644 (file)
index 0000000..0f44752
--- /dev/null
@@ -0,0 +1,294 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Fast SHA-1 implementation for SPE instruction set (PPC)
+ *
+ * This code makes use of the SPE SIMD instruction set as defined in
+ * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
+ * Implementation is based on optimization guide notes from
+ * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+#define rHP    r3      /* pointer to hash value                        */
+#define rWP    r4      /* pointer to input                             */
+#define rKP    r5      /* pointer to constants                         */
+
+#define rW0    r14     /* 64 bit round words                           */
+#define rW1    r15
+#define rW2    r16
+#define rW3    r17
+#define rW4    r18
+#define rW5    r19
+#define rW6    r20
+#define rW7    r21
+
+#define rH0    r6      /* 32 bit hash values                           */
+#define rH1    r7
+#define rH2    r8
+#define rH3    r9
+#define rH4    r10
+
+#define rT0    r22     /* 64 bit temporary                             */
+#define rT1    r0      /* 32 bit temporaries                           */
+#define rT2    r11
+#define rT3    r12
+
+#define rK     r23     /* 64 bit constant in volatile register         */
+
+#define LOAD_K01
+
+#define LOAD_K11 \
+       evlwwsplat      rK,0(rKP);
+
+#define LOAD_K21 \
+       evlwwsplat      rK,4(rKP);
+
+#define LOAD_K31 \
+       evlwwsplat      rK,8(rKP);
+
+#define LOAD_K41 \
+       evlwwsplat      rK,12(rKP);
+
+#define INITIALIZE \
+       stwu            r1,-128(r1);    /* create stack frame           */ \
+       evstdw          r14,8(r1);      /* We must save non volatile    */ \
+       evstdw          r15,16(r1);     /* registers. Take the chance   */ \
+       evstdw          r16,24(r1);     /* and save the SPE part too    */ \
+       evstdw          r17,32(r1);                                        \
+       evstdw          r18,40(r1);                                        \
+       evstdw          r19,48(r1);                                        \
+       evstdw          r20,56(r1);                                        \
+       evstdw          r21,64(r1);                                        \
+       evstdw          r22,72(r1);                                        \
+       evstdw          r23,80(r1);
+
+
+#define FINALIZE \
+       evldw           r14,8(r1);      /* restore SPE registers        */ \
+       evldw           r15,16(r1);                                        \
+       evldw           r16,24(r1);                                        \
+       evldw           r17,32(r1);                                        \
+       evldw           r18,40(r1);                                        \
+       evldw           r19,48(r1);                                        \
+       evldw           r20,56(r1);                                        \
+       evldw           r21,64(r1);                                        \
+       evldw           r22,72(r1);                                        \
+       evldw           r23,80(r1);                                        \
+       xor             r0,r0,r0;                                          \
+       stw             r0,8(r1);       /* Delete sensitive data        */ \
+       stw             r0,16(r1);      /* that we might have pushed    */ \
+       stw             r0,24(r1);      /* from other context that runs */ \
+       stw             r0,32(r1);      /* the same code. Assume that   */ \
+       stw             r0,40(r1);      /* the lower part of the GPRs   */ \
+       stw             r0,48(r1);      /* were already overwritten on  */ \
+       stw             r0,56(r1);      /* the way down to here         */ \
+       stw             r0,64(r1);                                         \
+       stw             r0,72(r1);                                         \
+       stw             r0,80(r1);                                         \
+       addi            r1,r1,128;      /* cleanup stack frame          */
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+       lwz             reg,off(rWP);   /* load data                    */
+#define NEXT_BLOCK \
+       addi            rWP,rWP,64;     /* increment per block          */
+#else
+#define LOAD_DATA(reg, off) \
+       lwbrx           reg,0,rWP;      /* load data                    */ \
+       addi            rWP,rWP,4;      /* increment per word           */
+#define NEXT_BLOCK                     /* nothing to do                */
+#endif
+
+#define        R_00_15(a, b, c, d, e, w0, w1, k, off) \
+       LOAD_DATA(w0, off)              /* 1: W                         */ \
+       and             rT2,b,c;        /* 1: F' = B and C              */ \
+       LOAD_K##k##1                                                       \
+       andc            rT1,d,b;        /* 1: F" = ~B and D             */ \
+       rotrwi          rT0,a,27;       /* 1: A' = A rotl 5             */ \
+       or              rT2,rT2,rT1;    /* 1: F = F' or F"              */ \
+       add             e,e,rT0;        /* 1: E = E + A'                */ \
+       rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
+       add             e,e,w0;         /* 1: E = E + W                 */ \
+       LOAD_DATA(w1, off+4)            /* 2: W                         */ \
+       add             e,e,rT2;        /* 1: E = E + F                 */ \
+       and             rT1,a,b;        /* 2: F' = B and C              */ \
+       add             e,e,rK;         /* 1: E = E + K                 */ \
+       andc            rT2,c,a;        /* 2: F" = ~B and D             */ \
+       add             d,d,rK;         /* 2: E = E + K                 */ \
+       or              rT2,rT2,rT1;    /* 2: F = F' or F"              */ \
+       rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
+       add             d,d,w1;         /* 2: E = E + W                 */ \
+       rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
+       add             d,d,rT0;        /* 2: E = E + A'                */ \
+       evmergelo       w1,w1,w0;       /*    mix W[0]/W[1]             */ \
+       add             d,d,rT2         /* 2: E = E + F                 */
+
+#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+       and             rT2,b,c;        /* 1: F' = B and C              */ \
+       evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
+       andc            rT1,d,b;        /* 1: F" = ~B and D             */ \
+       evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
+       or              rT1,rT1,rT2;    /* 1: F = F' or F"              */ \
+       evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
+       add             e,e,rT1;        /* 1: E = E + F                 */ \
+       evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
+       rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
+       evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
+       add             e,e,rT2;        /* 1: E = E + A'                */ \
+       evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
+       rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
+       LOAD_K##k##1                                                       \
+       evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
+       add             e,e,rT0;        /* 1: E = E + WK                */ \
+       add             d,d,rT1;        /* 2: E = E + WK                */ \
+       and             rT2,a,b;        /* 2: F' = B and C              */ \
+       andc            rT1,c,a;        /* 2: F" = ~B and D             */ \
+       rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
+       or              rT1,rT1,rT2;    /* 2: F = F' or F"              */ \
+       add             d,d,rT0;        /* 2: E = E + A'                */ \
+       rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
+       add             d,d,rT1         /* 2: E = E + F                 */
+
+#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+       evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
+       xor             rT2,b,c;        /* 1: F' = B xor C              */ \
+       evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
+       xor             rT2,rT2,d;      /* 1: F = F' xor D              */ \
+       evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
+       add             e,e,rT2;        /* 1: E = E + F                 */ \
+       evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
+       rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
+       evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
+       add             e,e,rT2;        /* 1: E = E + A'                */ \
+       evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
+       rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
+       LOAD_K##k##1                                                       \
+       evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
+       add             e,e,rT0;        /* 1: E = E + WK                */ \
+       xor             rT2,a,b;        /* 2: F' = B xor C              */ \
+       add             d,d,rT1;        /* 2: E = E + WK                */ \
+       xor             rT2,rT2,c;      /* 2: F = F' xor D              */ \
+       rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
+       add             d,d,rT2;        /* 2: E = E + F                 */ \
+       rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
+       add             d,d,rT0         /* 2: E = E + A'                */
+
+#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+       and             rT2,b,c;        /* 1: F' = B and C              */ \
+       evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
+       or              rT1,b,c;        /* 1: F" = B or C               */ \
+       evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
+       and             rT1,d,rT1;      /* 1: F" = F" and D             */ \
+       evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
+       or              rT2,rT2,rT1;    /* 1: F = F' or F"              */ \
+       evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
+       add             e,e,rT2;        /* 1: E = E + F                 */ \
+       evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
+       rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
+       evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
+       add             e,e,rT2;        /* 1: E = E + A'                */ \
+       LOAD_K##k##1                                                       \
+       evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
+       rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
+       add             e,e,rT0;        /* 1: E = E + WK                */ \
+       and             rT2,a,b;        /* 2: F' = B and C              */ \
+       or              rT0,a,b;        /* 2: F" = B or C               */ \
+       add             d,d,rT1;        /* 2: E = E + WK                */ \
+       and             rT0,c,rT0;      /* 2: F" = F" and D             */ \
+       rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
+       or              rT2,rT2,rT0;    /* 2: F = F' or F"              */ \
+       rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
+       add             d,d,rT2;        /* 2: E = E + F                 */ \
+       add             d,d,rT0         /* 2: E = E + A'                */
+
+#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+       R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
+
+_GLOBAL(ppc_spe_sha1_transform)
+       INITIALIZE
+
+       lwz             rH0,0(rHP)
+       lwz             rH1,4(rHP)
+       mtctr           r5
+       lwz             rH2,8(rHP)
+       lis             rKP,PPC_SPE_SHA1_K@h
+       lwz             rH3,12(rHP)
+       ori             rKP,rKP,PPC_SPE_SHA1_K@l
+       lwz             rH4,16(rHP)
+
+ppc_spe_sha1_main:
+       R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
+       R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
+       R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
+       R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
+       R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
+       R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
+       R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
+       R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)
+
+       R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
+       R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)
+
+       R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
+       R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
+       R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
+       R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
+       R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
+       R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
+       R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
+       R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
+       R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
+       R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)
+
+       R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
+       R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
+       R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
+       R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
+       R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
+       R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
+       R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
+       R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
+       R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
+       R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)
+
+       R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
+       R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
+       R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
+       R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
+       R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
+       R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
+       R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
+       lwz             rT3,0(rHP)
+       R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
+       lwz             rW1,4(rHP)
+       R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
+       lwz             rW2,8(rHP)
+       R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
+       lwz             rW3,12(rHP)
+       NEXT_BLOCK
+       lwz             rW4,16(rHP)
+
+       add             rH0,rH0,rT3
+       stw             rH0,0(rHP)
+       add             rH1,rH1,rW1
+       stw             rH1,4(rHP)
+       add             rH2,rH2,rW2
+       stw             rH2,8(rHP)
+       add             rH3,rH3,rW3
+       stw             rH3,12(rHP)
+       add             rH4,rH4,rW4
+       stw             rH4,16(rHP)
+
+       bdnz            ppc_spe_sha1_main
+
+       FINALIZE
+       blr
+
+.data
+.align 4
+PPC_SPE_SHA1_K:
+       .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
diff --git a/lib/crypto/powerpc/sha1.h b/lib/crypto/powerpc/sha1.h
new file mode 100644 (file)
index 0000000..e2c010f
--- /dev/null
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for PowerPC
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/switch_to.h>
+#include <linux/preempt.h>
+
+#ifdef CONFIG_SPE
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). SHA1 takes ~1000
+ * operations per 64 bytes. e500 cores can issue two arithmetic instructions
+ * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
+ * Thus 2KB of input data will need an estimated maximum of 18,000 cycles.
+ * Headroom for cache misses included. Even with the low end model clocked
+ * at 667 MHz this equates to a critical time window of less than 27us.
+ *
+ */
+#define MAX_BYTES 2048
+
+asmlinkage void ppc_spe_sha1_transform(struct sha1_block_state *state,
+                                      const u8 *data, u32 nblocks);
+
+static void spe_begin(void)
+{
+       /* We just start SPE operations and will save SPE registers later. */
+       preempt_disable();
+       enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+       disable_kernel_spe();
+       /* reenable preemption */
+       preempt_enable();
+}
+
+static void sha1_blocks(struct sha1_block_state *state,
+                       const u8 *data, size_t nblocks)
+{
+       do {
+               u32 unit = min_t(size_t, nblocks, MAX_BYTES / SHA1_BLOCK_SIZE);
+
+               spe_begin();
+               ppc_spe_sha1_transform(state, data, unit);
+               spe_end();
+
+               data += unit * SHA1_BLOCK_SIZE;
+               nblocks -= unit;
+       } while (nblocks);
+}
+#else /* CONFIG_SPE */
+asmlinkage void powerpc_sha_transform(struct sha1_block_state *state,
+                                     const u8 data[SHA1_BLOCK_SIZE]);
+
+static void sha1_blocks(struct sha1_block_state *state,
+                       const u8 *data, size_t nblocks)
+{
+       do {
+               powerpc_sha_transform(state, data);
+               data += SHA1_BLOCK_SIZE;
+       } while (--nblocks);
+}
+#endif /* !CONFIG_SPE */