From: Eric Biggers Date: Mon, 28 Apr 2025 17:00:31 +0000 (-0700) Subject: crypto: powerpc/sha256 - implement library instead of shash X-Git-Tag: block-6.16-20250606~34^2~117 X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=1a49c573bf886349c7bc958b331c700cf18601d8;p=linux-block.git crypto: powerpc/sha256 - implement library instead of shash Instead of providing crypto_shash algorithms for the arch-optimized SHA-256 code, instead implement the SHA-256 library. This is much simpler, it makes the SHA-256 library functions be arch-optimized, and it fixes the longstanding issue where the arch-optimized SHA-256 was disabled by default. SHA-256 still remains available through crypto_shash, but individual architectures no longer need to handle it. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index 4bf7b01228e7..caaa359f4742 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -39,17 +39,6 @@ config CRYPTO_SHA1_PPC_SPE Architecture: powerpc using - SPE (Signal Processing Engine) extensions -config CRYPTO_SHA256_PPC_SPE - tristate "Hash functions: SHA-224 and SHA-256 (SPE)" - depends on SPE - select CRYPTO_SHA256 - select CRYPTO_HASH - help - SHA-224 and SHA-256 secure hash algorithms (FIPS 180) - - Architecture: powerpc using - - SPE (Signal Processing Engine) extensions - config CRYPTO_AES_PPC_SPE tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)" depends on SPE diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index f13aec8a1833..8c2936ae466f 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -9,7 +9,6 @@ obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o -obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o @@ -18,7 +17,6 @@ aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes- md5-ppc-y := md5-asm.o md5-glue.o sha1-powerpc-y := sha1-powerpc-asm.o sha1.o sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o -sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o diff --git a/arch/powerpc/crypto/sha256-spe-asm.S b/arch/powerpc/crypto/sha256-spe-asm.S deleted file mode 100644 index cd99d71dae34..000000000000 --- a/arch/powerpc/crypto/sha256-spe-asm.S +++ /dev/null @@ -1,318 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Fast SHA-256 implementation for SPE instruction set (PPC) - * - * This code makes use of the SPE SIMD instruction set as defined in - * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf - * Implementation is based on optimization guide notes from - * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf - * - * Copyright (c) 2015 Markus Stockhausen - */ - -#include -#include - -#define rHP r3 /* pointer to hash values in memory */ -#define rKP r24 /* pointer to round constants */ -#define rWP r4 /* pointer to input data */ - -#define rH0 r5 /* 8 32 bit hash values in 8 registers */ -#define 
rH1 r6 -#define rH2 r7 -#define rH3 r8 -#define rH4 r9 -#define rH5 r10 -#define rH6 r11 -#define rH7 r12 - -#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */ -#define rW1 r15 -#define rW2 r16 -#define rW3 r17 -#define rW4 r18 -#define rW5 r19 -#define rW6 r20 -#define rW7 r21 - -#define rT0 r22 /* 64 bit temporaries */ -#define rT1 r23 -#define rT2 r0 /* 32 bit temporaries */ -#define rT3 r25 - -#define CMP_KN_LOOP -#define CMP_KC_LOOP \ - cmpwi rT1,0; - -#define INITIALIZE \ - stwu r1,-128(r1); /* create stack frame */ \ - evstdw r14,8(r1); /* We must save non volatile */ \ - evstdw r15,16(r1); /* registers. Take the chance */ \ - evstdw r16,24(r1); /* and save the SPE part too */ \ - evstdw r17,32(r1); \ - evstdw r18,40(r1); \ - evstdw r19,48(r1); \ - evstdw r20,56(r1); \ - evstdw r21,64(r1); \ - evstdw r22,72(r1); \ - evstdw r23,80(r1); \ - stw r24,88(r1); /* save normal registers */ \ - stw r25,92(r1); - - -#define FINALIZE \ - evldw r14,8(r1); /* restore SPE registers */ \ - evldw r15,16(r1); \ - evldw r16,24(r1); \ - evldw r17,32(r1); \ - evldw r18,40(r1); \ - evldw r19,48(r1); \ - evldw r20,56(r1); \ - evldw r21,64(r1); \ - evldw r22,72(r1); \ - evldw r23,80(r1); \ - lwz r24,88(r1); /* restore normal registers */ \ - lwz r25,92(r1); \ - xor r0,r0,r0; \ - stw r0,8(r1); /* Delete sensitive data */ \ - stw r0,16(r1); /* that we might have pushed */ \ - stw r0,24(r1); /* from other context that runs */ \ - stw r0,32(r1); /* the same code. Assume that */ \ - stw r0,40(r1); /* the lower part of the GPRs */ \ - stw r0,48(r1); /* was already overwritten on */ \ - stw r0,56(r1); /* the way down to here */ \ - stw r0,64(r1); \ - stw r0,72(r1); \ - stw r0,80(r1); \ - addi r1,r1,128; /* cleanup stack frame */ - -#ifdef __BIG_ENDIAN__ -#define LOAD_DATA(reg, off) \ - lwz reg,off(rWP); /* load data */ -#define NEXT_BLOCK \ - addi rWP,rWP,64; /* increment per block */ -#else -#define LOAD_DATA(reg, off) \ - lwbrx reg,0,rWP; /* load data */ \ - addi rWP,rWP,4; /* increment per word */ -#define NEXT_BLOCK /* nothing to do */ -#endif - -#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \ - LOAD_DATA(w, off) /* 1: W */ \ - rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \ - rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \ - rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \ - xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \ - and rT3,e,f; /* 1: ch = e and f */ \ - xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \ - andc rT1,g,e; /* 1: ch' = ~e and g */ \ - lwz rT2,off(rKP); /* 1: K */ \ - xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \ - add h,h,rT0; /* 1: temp1 = h + S1 */ \ - add rT3,rT3,w; /* 1: temp1' = ch + w */ \ - rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \ - add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \ - rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \ - add h,h,rT2; /* 1: temp1 = temp1 + K */ \ - rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \ - xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \ - add d,d,h; /* 1: d = d + temp1 */ \ - xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \ - evmergelo w,w,w; /* shift W */ \ - or rT2,a,b; /* 1: maj = a or b */ \ - and rT1,a,b; /* 1: maj' = a and b */ \ - and rT2,rT2,c; /* 1: maj = maj and c */ \ - LOAD_DATA(w, off+4) /* 2: W */ \ - or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \ - rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ - add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \ - rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ - add h,h,rT3; /* 1: h = temp1 + temp2 */ \ - rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \ - xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ - and rT3,d,e; /* 2: ch = e and f */ \ - xor 
rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ - andc rT1,f,d; /* 2: ch' = ~e and g */ \ - lwz rT2,off+4(rKP); /* 2: K */ \ - xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ - add g,g,rT0; /* 2: temp1 = h + S1 */ \ - add rT3,rT3,w; /* 2: temp1' = ch + w */ \ - rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ - add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \ - rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ - add g,g,rT2; /* 2: temp1 = temp1 + K */ \ - rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ - xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ - or rT2,h,a; /* 2: maj = a or b */ \ - xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ - and rT1,h,a; /* 2: maj' = a and b */ \ - and rT2,rT2,b; /* 2: maj = maj and c */ \ - add c,c,g; /* 2: d = d + temp1 */ \ - or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ - add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ - add g,g,rT3 /* 2: h = temp1 + temp2 */ - -#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \ - rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \ - evmergelohi rT0,w0,w1; /* w[-15] */ \ - rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \ - evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \ - xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ - evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \ - rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \ - evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \ - xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ - evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \ - add h,h,rT2; /* 1: temp1 = h + S1 */ \ - evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \ - and rT2,e,f; /* 1: ch = e and f */ \ - evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \ - andc rT3,g,e; /* 1: ch' = ~e and g */ \ - evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \ - xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \ - evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \ - add h,h,rT2; /* 1: temp1 = temp1 + ch */ \ - evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ - rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \ - evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \ - rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \ - evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ - xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ - evldw rT1,off(rKP); /* k */ \ - rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \ - evaddw w0,w0,rT0; /* w = w + s1 */ \ - xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ - evmergelohi rT0,w4,w5; /* w[-7] */ \ - and rT3,a,b; /* 1: maj = a and b */ \ - evaddw w0,w0,rT0; /* w = w + w[-7] */ \ - CMP_K##k##_LOOP \ - add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \ - evaddw rT1,rT1,w0; /* wk = w + k */ \ - xor rT3,a,b; /* 1: maj = a xor b */ \ - evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \ - and rT3,rT3,c; /* 1: maj = maj and c */ \ - add h,h,rT0; /* 1: temp1 = temp1 + wk */ \ - add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \ - add g,g,rT1; /* 2: temp1 = temp1 + wk */ \ - add d,d,h; /* 1: d = d + temp1 */ \ - rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ - add h,h,rT2; /* 1: h = temp1 + temp2 */ \ - rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ - rotrwi rT2,d,25; /* 2: S" = e rotr 25 */ \ - xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ - and rT3,d,e; /* 2: ch = e and f */ \ - xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ - andc rT1,f,d; /* 2: ch' = ~e and g */ \ - add g,g,rT0; /* 2: temp1 = h + S1 */ \ - xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ - rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ - add g,g,rT3; /* 2: temp1 = temp1 + ch */ \ - rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ - rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ - xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ - or rT2,h,a; /* 2: maj = a or b */ \ - and rT1,h,a; /* 2: maj' = a and b */ \ - and rT2,rT2,b; /* 2: maj = maj and c */ \ - xor 
rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ - or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ - add c,c,g; /* 2: d = d + temp1 */ \ - add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ - add g,g,rT3 /* 2: h = temp1 + temp2 */ - -_GLOBAL(ppc_spe_sha256_transform) - INITIALIZE - - mtctr r5 - lwz rH0,0(rHP) - lwz rH1,4(rHP) - lwz rH2,8(rHP) - lwz rH3,12(rHP) - lwz rH4,16(rHP) - lwz rH5,20(rHP) - lwz rH6,24(rHP) - lwz rH7,28(rHP) - -ppc_spe_sha256_main: - lis rKP,PPC_SPE_SHA256_K@ha - addi rKP,rKP,PPC_SPE_SHA256_K@l - - R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0) - R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8) - R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16) - R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24) - R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32) - R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40) - R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48) - R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56) -ppc_spe_sha256_16_rounds: - addi rKP,rKP,64 - R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, - rW0, rW1, rW4, rW5, rW7, N, 0) - R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, - rW1, rW2, rW5, rW6, rW0, N, 8) - R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, - rW2, rW3, rW6, rW7, rW1, N, 16) - R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, - rW3, rW4, rW7, rW0, rW2, N, 24) - R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, - rW4, rW5, rW0, rW1, rW3, N, 32) - R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, - rW5, rW6, rW1, rW2, rW4, N, 40) - R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, - rW6, rW7, rW2, rW3, rW5, N, 48) - R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, - rW7, rW0, rW3, rW4, rW6, C, 56) - bt gt,ppc_spe_sha256_16_rounds - - lwz rW0,0(rHP) - NEXT_BLOCK - lwz rW1,4(rHP) - lwz rW2,8(rHP) - lwz rW3,12(rHP) - lwz rW4,16(rHP) - lwz rW5,20(rHP) - lwz rW6,24(rHP) - lwz rW7,28(rHP) - - add rH0,rH0,rW0 - stw rH0,0(rHP) - add rH1,rH1,rW1 - stw rH1,4(rHP) - add rH2,rH2,rW2 - stw rH2,8(rHP) - add rH3,rH3,rW3 - stw rH3,12(rHP) - add rH4,rH4,rW4 - stw rH4,16(rHP) - add rH5,rH5,rW5 - stw rH5,20(rHP) - add rH6,rH6,rW6 - stw rH6,24(rHP) - add rH7,rH7,rW7 - stw rH7,28(rHP) - - bdnz ppc_spe_sha256_main - - FINALIZE - blr - -.data -.align 5 -PPC_SPE_SHA256_K: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 diff --git a/arch/powerpc/crypto/sha256-spe-glue.c b/arch/powerpc/crypto/sha256-spe-glue.c deleted file mode 100644 index 42c76bf8062d..000000000000 --- a/arch/powerpc/crypto/sha256-spe-glue.c +++ /dev/null @@ -1,128 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Glue code for SHA-256 implementation for SPE instructions (PPC) - * - * Based on generic implementation. 
The assembler module takes care - * about the SPE registers so it can run from interrupt context. - * - * Copyright (c) 2015 Markus Stockhausen - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * MAX_BYTES defines the number of bytes that are allowed to be processed - * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000 - * operations per 64 bytes. e500 cores can issue two arithmetic instructions - * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). - * Thus 1KB of input data will need an estimated maximum of 18,000 cycles. - * Headroom for cache misses included. Even with the low end model clocked - * at 667 MHz this equals to a critical time window of less than 27us. - * - */ -#define MAX_BYTES 1024 - -extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks); - -static void spe_begin(void) -{ - /* We just start SPE operations and will save SPE registers later. */ - preempt_disable(); - enable_kernel_spe(); -} - -static void spe_end(void) -{ - disable_kernel_spe(); - /* reenable preemption */ - preempt_enable(); -} - -static void ppc_spe_sha256_block(struct crypto_sha256_state *sctx, - const u8 *src, int blocks) -{ - do { - /* cut input data into smaller blocks */ - int unit = min(blocks, MAX_BYTES / SHA256_BLOCK_SIZE); - - spe_begin(); - ppc_spe_sha256_transform(sctx->state, src, unit); - spe_end(); - - src += unit * SHA256_BLOCK_SIZE; - blocks -= unit; - } while (blocks); -} - -static int ppc_spe_sha256_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha256_base_do_update_blocks(desc, data, len, - ppc_spe_sha256_block); -} - -static int ppc_spe_sha256_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *out) -{ - sha256_base_do_finup(desc, src, len, ppc_spe_sha256_block); - return sha256_base_finish(desc, out); -} - -static struct shash_alg algs[2] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = ppc_spe_sha256_update, - .finup = ppc_spe_sha256_finup, - .descsize = sizeof(struct crypto_sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name= "sha256-ppc-spe", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = ppc_spe_sha256_update, - .finup = ppc_spe_sha256_finup, - .descsize = sizeof(struct crypto_sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name= "sha224-ppc-spe", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int __init ppc_spe_sha256_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit ppc_spe_sha256_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_init(ppc_spe_sha256_mod_init); -module_exit(ppc_spe_sha256_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm, SPE optimized"); - -MODULE_ALIAS_CRYPTO("sha224"); -MODULE_ALIAS_CRYPTO("sha224-ppc-spe"); -MODULE_ALIAS_CRYPTO("sha256"); -MODULE_ALIAS_CRYPTO("sha256-ppc-spe"); diff --git a/arch/powerpc/lib/crypto/Kconfig b/arch/powerpc/lib/crypto/Kconfig index bf6d0ab22c27..ffa541ad6d5d 100644 --- a/arch/powerpc/lib/crypto/Kconfig +++ b/arch/powerpc/lib/crypto/Kconfig @@ 
-13,3 +13,9 @@ config CRYPTO_POLY1305_P10 default CRYPTO_LIB_POLY1305 select CRYPTO_ARCH_HAVE_LIB_POLY1305 select CRYPTO_LIB_POLY1305_GENERIC + +config CRYPTO_SHA256_PPC_SPE + tristate + depends on SPE + default CRYPTO_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256 diff --git a/arch/powerpc/lib/crypto/Makefile b/arch/powerpc/lib/crypto/Makefile index 5709ae14258a..27f231f8e334 100644 --- a/arch/powerpc/lib/crypto/Makefile +++ b/arch/powerpc/lib/crypto/Makefile @@ -5,3 +5,6 @@ chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o + +obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o +sha256-ppc-spe-y := sha256.o sha256-spe-asm.o diff --git a/arch/powerpc/lib/crypto/sha256-spe-asm.S b/arch/powerpc/lib/crypto/sha256-spe-asm.S new file mode 100644 index 000000000000..cd99d71dae34 --- /dev/null +++ b/arch/powerpc/lib/crypto/sha256-spe-asm.S @@ -0,0 +1,318 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Fast SHA-256 implementation for SPE instruction set (PPC) + * + * This code makes use of the SPE SIMD instruction set as defined in + * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf + * Implementation is based on optimization guide notes from + * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf + * + * Copyright (c) 2015 Markus Stockhausen + */ + +#include +#include + +#define rHP r3 /* pointer to hash values in memory */ +#define rKP r24 /* pointer to round constants */ +#define rWP r4 /* pointer to input data */ + +#define rH0 r5 /* 8 32 bit hash values in 8 registers */ +#define rH1 r6 +#define rH2 r7 +#define rH3 r8 +#define rH4 r9 +#define rH5 r10 +#define rH6 r11 +#define rH7 r12 + +#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */ +#define rW1 r15 +#define rW2 r16 +#define rW3 r17 +#define rW4 r18 +#define rW5 r19 +#define rW6 r20 +#define rW7 r21 + +#define rT0 r22 /* 64 bit temporaries */ +#define rT1 r23 +#define rT2 r0 /* 32 bit temporaries */ +#define rT3 r25 + +#define CMP_KN_LOOP +#define CMP_KC_LOOP \ + cmpwi rT1,0; + +#define INITIALIZE \ + stwu r1,-128(r1); /* create stack frame */ \ + evstdw r14,8(r1); /* We must save non volatile */ \ + evstdw r15,16(r1); /* registers. Take the chance */ \ + evstdw r16,24(r1); /* and save the SPE part too */ \ + evstdw r17,32(r1); \ + evstdw r18,40(r1); \ + evstdw r19,48(r1); \ + evstdw r20,56(r1); \ + evstdw r21,64(r1); \ + evstdw r22,72(r1); \ + evstdw r23,80(r1); \ + stw r24,88(r1); /* save normal registers */ \ + stw r25,92(r1); + + +#define FINALIZE \ + evldw r14,8(r1); /* restore SPE registers */ \ + evldw r15,16(r1); \ + evldw r16,24(r1); \ + evldw r17,32(r1); \ + evldw r18,40(r1); \ + evldw r19,48(r1); \ + evldw r20,56(r1); \ + evldw r21,64(r1); \ + evldw r22,72(r1); \ + evldw r23,80(r1); \ + lwz r24,88(r1); /* restore normal registers */ \ + lwz r25,92(r1); \ + xor r0,r0,r0; \ + stw r0,8(r1); /* Delete sensitive data */ \ + stw r0,16(r1); /* that we might have pushed */ \ + stw r0,24(r1); /* from other context that runs */ \ + stw r0,32(r1); /* the same code. 
Assume that */ \ + stw r0,40(r1); /* the lower part of the GPRs */ \ + stw r0,48(r1); /* was already overwritten on */ \ + stw r0,56(r1); /* the way down to here */ \ + stw r0,64(r1); \ + stw r0,72(r1); \ + stw r0,80(r1); \ + addi r1,r1,128; /* cleanup stack frame */ + +#ifdef __BIG_ENDIAN__ +#define LOAD_DATA(reg, off) \ + lwz reg,off(rWP); /* load data */ +#define NEXT_BLOCK \ + addi rWP,rWP,64; /* increment per block */ +#else +#define LOAD_DATA(reg, off) \ + lwbrx reg,0,rWP; /* load data */ \ + addi rWP,rWP,4; /* increment per word */ +#define NEXT_BLOCK /* nothing to do */ +#endif + +#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \ + LOAD_DATA(w, off) /* 1: W */ \ + rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \ + rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \ + rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \ + and rT3,e,f; /* 1: ch = e and f */ \ + xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \ + andc rT1,g,e; /* 1: ch' = ~e and g */ \ + lwz rT2,off(rKP); /* 1: K */ \ + xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \ + add h,h,rT0; /* 1: temp1 = h + S1 */ \ + add rT3,rT3,w; /* 1: temp1' = ch + w */ \ + rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \ + add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \ + rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \ + add h,h,rT2; /* 1: temp1 = temp1 + K */ \ + rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \ + add d,d,h; /* 1: d = d + temp1 */ \ + xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \ + evmergelo w,w,w; /* shift W */ \ + or rT2,a,b; /* 1: maj = a or b */ \ + and rT1,a,b; /* 1: maj' = a and b */ \ + and rT2,rT2,c; /* 1: maj = maj and c */ \ + LOAD_DATA(w, off+4) /* 2: W */ \ + or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \ + rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ + add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \ + rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ + add h,h,rT3; /* 1: h = temp1 + temp2 */ \ + rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ + and rT3,d,e; /* 2: ch = e and f */ \ + xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ + andc rT1,f,d; /* 2: ch' = ~e and g */ \ + lwz rT2,off+4(rKP); /* 2: K */ \ + xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ + add g,g,rT0; /* 2: temp1 = h + S1 */ \ + add rT3,rT3,w; /* 2: temp1' = ch + w */ \ + rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ + add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \ + rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ + add g,g,rT2; /* 2: temp1 = temp1 + K */ \ + rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ + or rT2,h,a; /* 2: maj = a or b */ \ + xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ + and rT1,h,a; /* 2: maj' = a and b */ \ + and rT2,rT2,b; /* 2: maj = maj and c */ \ + add c,c,g; /* 2: d = d + temp1 */ \ + or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ + add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ + add g,g,rT3 /* 2: h = temp1 + temp2 */ + +#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \ + rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \ + evmergelohi rT0,w0,w1; /* w[-15] */ \ + rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \ + evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \ + xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ + evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \ + rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \ + evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \ + xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ + evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \ + add h,h,rT2; /* 1: temp1 = h + S1 */ \ + evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \ + and 
rT2,e,f; /* 1: ch = e and f */ \ + evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \ + andc rT3,g,e; /* 1: ch' = ~e and g */ \ + evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \ + xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \ + evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \ + add h,h,rT2; /* 1: temp1 = temp1 + ch */ \ + evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ + rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \ + evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \ + rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \ + evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ + xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ + evldw rT1,off(rKP); /* k */ \ + rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \ + evaddw w0,w0,rT0; /* w = w + s1 */ \ + xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ + evmergelohi rT0,w4,w5; /* w[-7] */ \ + and rT3,a,b; /* 1: maj = a and b */ \ + evaddw w0,w0,rT0; /* w = w + w[-7] */ \ + CMP_K##k##_LOOP \ + add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \ + evaddw rT1,rT1,w0; /* wk = w + k */ \ + xor rT3,a,b; /* 1: maj = a xor b */ \ + evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \ + and rT3,rT3,c; /* 1: maj = maj and c */ \ + add h,h,rT0; /* 1: temp1 = temp1 + wk */ \ + add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \ + add g,g,rT1; /* 2: temp1 = temp1 + wk */ \ + add d,d,h; /* 1: d = d + temp1 */ \ + rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ + add h,h,rT2; /* 1: h = temp1 + temp2 */ \ + rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ + rotrwi rT2,d,25; /* 2: S" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ + and rT3,d,e; /* 2: ch = e and f */ \ + xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ + andc rT1,f,d; /* 2: ch' = ~e and g */ \ + add g,g,rT0; /* 2: temp1 = h + S1 */ \ + xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ + rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ + add g,g,rT3; /* 2: temp1 = temp1 + ch */ \ + rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ + rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ + or rT2,h,a; /* 2: maj = a or b */ \ + and rT1,h,a; /* 2: maj' = a and b */ \ + and rT2,rT2,b; /* 2: maj = maj and c */ \ + xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ + or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ + add c,c,g; /* 2: d = d + temp1 */ \ + add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ + add g,g,rT3 /* 2: h = temp1 + temp2 */ + +_GLOBAL(ppc_spe_sha256_transform) + INITIALIZE + + mtctr r5 + lwz rH0,0(rHP) + lwz rH1,4(rHP) + lwz rH2,8(rHP) + lwz rH3,12(rHP) + lwz rH4,16(rHP) + lwz rH5,20(rHP) + lwz rH6,24(rHP) + lwz rH7,28(rHP) + +ppc_spe_sha256_main: + lis rKP,PPC_SPE_SHA256_K@ha + addi rKP,rKP,PPC_SPE_SHA256_K@l + + R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0) + R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8) + R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16) + R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24) + R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32) + R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40) + R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48) + R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56) +ppc_spe_sha256_16_rounds: + addi rKP,rKP,64 + R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, + rW0, rW1, rW4, rW5, rW7, N, 0) + R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, + rW1, rW2, rW5, rW6, rW0, N, 8) + R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, + rW2, rW3, rW6, rW7, rW1, N, 16) + R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, + rW3, rW4, rW7, rW0, rW2, N, 24) + R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, + rW4, rW5, rW0, rW1, rW3, N, 32) + R_CALC_W(rH6, rH7, rH0, rH1, rH2, 
rH3, rH4, rH5, + rW5, rW6, rW1, rW2, rW4, N, 40) + R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, + rW6, rW7, rW2, rW3, rW5, N, 48) + R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, + rW7, rW0, rW3, rW4, rW6, C, 56) + bt gt,ppc_spe_sha256_16_rounds + + lwz rW0,0(rHP) + NEXT_BLOCK + lwz rW1,4(rHP) + lwz rW2,8(rHP) + lwz rW3,12(rHP) + lwz rW4,16(rHP) + lwz rW5,20(rHP) + lwz rW6,24(rHP) + lwz rW7,28(rHP) + + add rH0,rH0,rW0 + stw rH0,0(rHP) + add rH1,rH1,rW1 + stw rH1,4(rHP) + add rH2,rH2,rW2 + stw rH2,8(rHP) + add rH3,rH3,rW3 + stw rH3,12(rHP) + add rH4,rH4,rW4 + stw rH4,16(rHP) + add rH5,rH5,rW5 + stw rH5,20(rHP) + add rH6,rH6,rW6 + stw rH6,24(rHP) + add rH7,rH7,rW7 + stw rH7,28(rHP) + + bdnz ppc_spe_sha256_main + + FINALIZE + blr + +.data +.align 5 +PPC_SPE_SHA256_K: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 diff --git a/arch/powerpc/lib/crypto/sha256.c b/arch/powerpc/lib/crypto/sha256.c new file mode 100644 index 000000000000..c05023c5acdd --- /dev/null +++ b/arch/powerpc/lib/crypto/sha256.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-256 Secure Hash Algorithm, SPE optimized + * + * Based on generic implementation. The assembler module takes care + * about the SPE registers so it can run from interrupt context. + * + * Copyright (c) 2015 Markus Stockhausen + */ + +#include +#include +#include +#include +#include + +/* + * MAX_BYTES defines the number of bytes that are allowed to be processed + * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000 + * operations per 64 bytes. e500 cores can issue two arithmetic instructions + * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). + * Thus 1KB of input data will need an estimated maximum of 18,000 cycles. + * Headroom for cache misses included. Even with the low end model clocked + * at 667 MHz this equals to a critical time window of less than 27us. + * + */ +#define MAX_BYTES 1024 + +extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks); + +static void spe_begin(void) +{ + /* We just start SPE operations and will save SPE registers later. 
*/ + preempt_disable(); + enable_kernel_spe(); +} + +static void spe_end(void) +{ + disable_kernel_spe(); + /* reenable preemption */ + preempt_enable(); +} + +void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks) +{ + do { + /* cut input data into smaller blocks */ + u32 unit = min_t(size_t, nblocks, + MAX_BYTES / SHA256_BLOCK_SIZE); + + spe_begin(); + ppc_spe_sha256_transform(state, data, unit); + spe_end(); + + data += unit * SHA256_BLOCK_SIZE; + nblocks -= unit; + } while (nblocks); +} +EXPORT_SYMBOL(sha256_blocks_arch); + +bool sha256_is_arch_optimized(void) +{ + return true; +} +EXPORT_SYMBOL(sha256_is_arch_optimized); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm, SPE optimized");
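
Illustration only, not part of the patch: callers are unaffected by this conversion and keep using the generic SHA-256 library interface from <crypto/sha2.h>; with this patch applied, sha256_blocks_arch() routes the block processing to the SPE assembly on SPE-capable parts. A minimal sketch of such a caller, assuming the existing <crypto/sha2.h> helpers (the one-shot sha256() and the sha256_init/update/final trio); the function name demo_sha256 is hypothetical:

/*
 * Sketch of a SHA-256 library user (illustration, not part of this
 * patch).  Uses the <crypto/sha2.h> helpers; the arch-optimized SPE
 * block function is selected underneath when available.
 */
#include <crypto/sha2.h>
#include <linux/types.h>

static void demo_sha256(const u8 *msg, unsigned int len)
{
	u8 digest[SHA256_DIGEST_SIZE];
	struct sha256_state sctx;

	/* One-shot helper. */
	sha256(msg, len, digest);

	/* Equivalent incremental form. */
	sha256_init(&sctx);
	sha256_update(&sctx, msg, len);
	sha256_final(&sctx, digest);
}

Because the library functions are now backed by the arch code, no per-architecture crypto_shash registration is needed for SHA-256; the crypto_shash interface remains available through the generic lib-backed implementation.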