- AVX2 (Advanced Vector Extensions 2)
- SHA-NI (SHA Extensions New Instructions)
-config CRYPTO_SHA256_SSSE3
- tristate "Hash functions: SHA-224 and SHA-256 (SSSE3/AVX/AVX2/SHA-NI)"
- depends on 64BIT
- select CRYPTO_SHA256
- select CRYPTO_HASH
- help
- SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX (Advanced Vector Extensions)
- - AVX2 (Advanced Vector Extensions 2)
- - SHA-NI (SHA Extensions New Instructions)
-
config CRYPTO_SHA512_SSSE3
tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)"
depends on 64BIT
obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o
-obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
-sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ni_asm.o sha256_ssse3_glue.o
-
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+++ /dev/null
-########################################################################
-# Implement fast SHA-256 with AVX1 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 1 block at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
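-
-# For reference, the digest-update use of this macro at the end of the
-# transform, e.g. "addm (4*0)(CTX), a", expands to:
-#	add (4*0)(CTX), a	# a += state[0]
-#	mov a, (4*0)(CTX)	# state[0] = a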
-
-
-.macro MY_ROR p1 p2
- shld $(32-(\p1)), \p2, \p2
-.endm
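-
-# Note: "shld $(32-n), x, x" shifts x left by (32-n) bits while filling from
-# the top of x itself, i.e. a rotate left by (32-n), which equals a rotate
-# right by n. So "MY_ROR 6, y0" behaves like "ror $6, y0".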
-
-################################
-
-# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
-# Load xmm with mem and byte swap each dword
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- VMOVDQ \p2, \p1
- vpshufb \p3, \p1, \p1
-.endm
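-
-# For example, "COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK" (as used
-# below to load the message block) performs an unaligned vmovdqu of the
-# first 16 input bytes into X0, then a vpshufb that converts each
-# big-endian message dword to host byte order.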
-
-################################
-
-X0 = %xmm4
-X1 = %xmm5
-X2 = %xmm6
-X3 = %xmm7
-
-XTMP0 = %xmm0
-XTMP1 = %xmm1
-XTMP2 = %xmm2
-XTMP3 = %xmm3
-XTMP4 = %xmm8
-XFER = %xmm9
-XTMP5 = %xmm11
-
-SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %xmm13
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-
-SRND = %rsi # clobbers INP
-c = %ecx
-d = %r8d
-e = %edx
-TBL = %r12
-a = %eax
-b = %ebx
-
-f = %r9d
-g = %r10d
-h = %r11d
-
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_XFER_SIZE = 16
-_XMM_SAVE_SIZE = 0
-
-_INP_END = 0
-_INP = _INP_END + _INP_END_SIZE
-_XFER = _INP + _INP_SIZE
-_XMM_SAVE = _XFER + _XFER_SIZE
-STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
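-
-# Example: after one ROTATE_ARGS, "h" names the register previously called
-# "g", and the register previously called "h" becomes the new "a". Eight
-# applications restore the original assignment, so the a..h rotation of
-# each SHA-256 round costs no register moves.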
-
-.macro FOUR_ROUNDS_AND_SCHED
- ## compute s0 four at a time and s1 two at a time
- ## compute W[-16] + W[-7] 4 at a time
-
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor a, y1 # y1 = a ^ (a >> (22-13))
- xor g, y2 # y2 = f^g
- vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- ## compute s0
- vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpsrld $7, XTMP1, XTMP2
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpslld $(32-7), XTMP1, XTMP3
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] MY_ROR 7
- or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- vpsrld $18, XTMP1, XTMP2 #
- xor a, y1 # y1 = a ^ (a >> (22-13))
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- vpslld $(32-18), XTMP1, XTMP1
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- vpxor XTMP1, XTMP3, XTMP3 #
- add y0, y2 # y2 = S1 + CH
- add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- ## compute low s1
- vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13))
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
- xor g, y2 # y2 = f^g
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA}
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- vpxor XTMP3, XTMP2, XTMP2 #
- add y0, y2 # y2 = S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- ## compute high s1
- vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
- or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
- xor a, y1 # y1 = a ^ (a >> (22-13))
- xor g, y2 # y2 = f^g
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC}
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- vpxor XTMP3, XTMP2, XTMP2
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-## input is [rsp + _XFER + %1 * 4]
-.macro DO_ROUND round
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- xor e, y0 # y0 = e ^ (e >> (25-11))
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13))
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- and e, y2 # y2 = (f^g)&e
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- add y0, y2 # y2 = S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- offset = \round * 4 + _XFER #
- add offset(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
-.endm
-
-########################################################################
-## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
-## arg 1 : pointer to state
-## arg 2 : pointer to input data
-## arg 3 : Num blocks
-########################################################################
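-##
-## Sketch of the expected call pattern: the C glue code owns the FPU
-## context around the call (see sha256_ssse3_glue.c), roughly
-##	kernel_fpu_begin();
-##	sha256_transform_avx(state, data, blocks);
-##	kernel_fpu_end();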
-.text
-SYM_TYPED_FUNC_START(sha256_transform_avx)
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbp
- movq %rsp, %rbp
-
- subq $STACK_SIZE, %rsp # allocate stack space
- and $~15, %rsp # align stack pointer
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS # pointer to end of data
- mov NUM_BLKS, _INP_END(%rsp)
-
- ## load initial digest
- mov 4*0(CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-.Lloop0:
- lea K256(%rip), TBL
-
- ## byte swap first 16 dwords
- COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
-
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 16 each
- mov $3, SRND
-.align 16
-.Lloop1:
- vpaddd (TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 1*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 2*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 3*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- add $4*16, TBL
- FOUR_ROUNDS_AND_SCHED
-
- sub $1, SRND
- jne .Lloop1
-
- mov $2, SRND
-.Lloop2:
- vpaddd (TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- vpaddd 1*16(TBL), X1, XFER
- vmovdqa XFER, _XFER(%rsp)
- add $2*16, TBL
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- vmovdqa X2, X0
- vmovdqa X3, X1
-
- sub $1, SRND
- jne .Lloop2
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- mov _INP(%rsp), INP
- add $64, INP
- cmp _INP_END(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- popq %rbp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- RET
-SYM_FUNC_END(sha256_transform_avx)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
-
-.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
-.align 16
-# shuffle xBxA -> 00BA
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
-.align 16
-# shuffle xDxC -> DC00
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
+++ /dev/null
-########################################################################
-# Implement fast SHA-256 with AVX2 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 2 blocks at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-################################
-
-X0 = %ymm4
-X1 = %ymm5
-X2 = %ymm6
-X3 = %ymm7
-
-# XMM versions of above
-XWORD0 = %xmm4
-XWORD1 = %xmm5
-XWORD2 = %xmm6
-XWORD3 = %xmm7
-
-XTMP0 = %ymm0
-XTMP1 = %ymm1
-XTMP2 = %ymm2
-XTMP3 = %ymm3
-XTMP4 = %ymm8
-XFER = %ymm9
-XTMP5 = %ymm11
-
-SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %ymm13
-
-X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-c = %ecx
-d = %r8d
-e = %edx # clobbers NUM_BLKS
-y3 = %esi # clobbers INP
-
-SRND = CTX # SRND is same register as CTX
-
-a = %eax
-b = %ebx
-f = %r9d
-g = %r10d
-h = %r11d
-old_h = %r11d
-
-T1 = %r12d
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
-_XMM_SAVE_SIZE = 0
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_CTX_SIZE = 8
-
-_XFER = 0
-_XMM_SAVE = _XFER + _XFER_SIZE
-_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
-_INP = _INP_END + _INP_END_SIZE
-_CTX = _INP + _INP_SIZE
-STACK_SIZE = _CTX + _CTX_SIZE
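-
-# Resulting offsets from the 32-byte-aligned %rsp, for reference:
-#	_XFER = 0 (512 bytes: 64 rounds x 4 bytes x 2 interleaved blocks),
-#	_INP_END = 512, _INP = 520, _CTX = 528, STACK_SIZE = 536.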
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
- X_ = X0
- X0 = X1
- X1 = X2
- X2 = X3
- X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
- old_h = h
- TMP_ = h
- h = g
- g = f
- f = e
- e = d
- d = c
- c = b
- b = a
- a = TMP_
-.endm
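-
-# Note: old_h is kept because DO_4ROUNDS defers the final two additions
-# into h (y2 and y3) to the start of the following round; after rotation,
-# old_h still names the register owed those additions.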
-
-.macro FOUR_ROUNDS_AND_SCHED disp
-################################### RND N + 0 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
-
- addl \disp(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
- vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
- vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
- rorx $6, e, y1 # y1 = (e >> 6) # S1
-
- and e, y2 # y2 = (f^g)&e # CH
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- add h, d # d = k + w + h + d # --
-
- and b, y3 # y3 = (a|c)&b # MAJA
- vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
-
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- vpsrld $7, XTMP1, XTMP2
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
-
- add y0, y2 # y2 = S1 + CH # --
- vpslld $(32-7), XTMP1, XTMP3
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
-
- vpsrld $18, XTMP1, XTMP2
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
-
- ROTATE_ARGS
-
-################################### RND N + 1 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- offset = \disp + 1*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
-
- vpslld $(32-18), XTMP1, XTMP1
- and b, y3 # y3 = (a|c)&b # MAJA
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
-
- vpxor XTMP1, XTMP3, XTMP3
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
- vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
-
-
- ROTATE_ARGS
-
-################################### RND N + 2 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- offset = \disp + 2*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
-
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- or c, y3 # y3 = a|c # MAJA
- mov f, y2 # y2 = f # CH
- xor g, y2 # y2 = f^g # CH
-
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
- and e, y2 # y2 = (f^g)&e # CH
-
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- vpxor XTMP3, XTMP2, XTMP2
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
- vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
-
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1,h # h = k + w + h + S0 # --
- add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3,h # h = t1 + S0 + MAJ # --
-
-
- ROTATE_ARGS
-
-################################### RND N + 3 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- offset = \disp + 3*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpxor XTMP3, XTMP2, XTMP2
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- add y0, y2 # y2 = S1 + CH # --
-
- vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
-
- vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
-
- add y1, h # h = k + w + h + S0 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-.macro DO_4ROUNDS disp
-################################### RND N + 0 ###########################
-
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- addl \disp(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 1 ###########################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*1 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 2 ##############################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*2 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 3 ###########################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*3 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
-
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3, h # h = t1 + S0 + MAJ # --
-
- ROTATE_ARGS
-
-.endm
-
-########################################################################
-## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
-## arg 1 : pointer to state
-## arg 2 : pointer to input data
-## arg 3 : Num blocks
-########################################################################
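-##
-## Two-block flow, for reference: the scheduling loops store K[t]+W[t] for
-## both interleaved blocks into the 512-byte _XFER area; the first block is
-## consumed during .Lloop1/.Lloop2, and .Lloop3 then processes the second
-## block from the saved values (offset +16 in each 32-byte row) with no
-## further scheduling work.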
-.text
-SYM_TYPED_FUNC_START(sha256_transform_rorx)
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
- push %rbp
- mov %rsp, %rbp
-
- subq $STACK_SIZE, %rsp
- and $-32, %rsp # align rsp to 32 byte boundary
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
- mov NUM_BLKS, _INP_END(%rsp)
-
- cmp NUM_BLKS, INP
- je .Lonly_one_block
-
- ## load initial digest
- mov (CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-
- mov CTX, _CTX(%rsp)
-
-.Lloop0:
- ## Load first 16 dwords from two blocks
- VMOVDQ 0*32(INP),XTMP0
- VMOVDQ 1*32(INP),XTMP1
- VMOVDQ 2*32(INP),XTMP2
- VMOVDQ 3*32(INP),XTMP3
-
- ## byte swap data
- vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
- vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
- vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
- vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
-
- ## transpose data into high/low halves
- vperm2i128 $0x20, XTMP2, XTMP0, X0
- vperm2i128 $0x31, XTMP2, XTMP0, X1
- vperm2i128 $0x20, XTMP3, XTMP1, X2
- vperm2i128 $0x31, XTMP3, XTMP1, X3
-
-.Llast_block_enter:
- add $64, INP
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 16 each
- xor SRND, SRND
-
-.align 16
-.Lloop1:
- leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 0*32)
-
- leaq K256+1*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 1*32)
-
- leaq K256+2*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)
-
- leaq K256+3*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 3*32)
-
- add $4*32, SRND
- cmp $3*4*32, SRND
- jb .Lloop1
-
-.Lloop2:
- ## Do last 16 rounds with no scheduling
- leaq K256+0*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- DO_4ROUNDS (_XFER + 0*32)
-
- leaq K256+1*32(%rip), INP
- vpaddd (INP, SRND), X1, XFER
- vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- DO_4ROUNDS (_XFER + 1*32)
- add $2*32, SRND
-
- vmovdqa X2, X0
- vmovdqa X3, X1
-
- cmp $4*4*32, SRND
- jb .Lloop2
-
- mov _CTX(%rsp), CTX
- mov _INP(%rsp), INP
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- cmp _INP_END(%rsp), INP
- ja .Ldone_hash
-
- #### Do second block using previously scheduled results
- xor SRND, SRND
-.align 16
-.Lloop3:
- DO_4ROUNDS (_XFER + 0*32 + 16)
- DO_4ROUNDS (_XFER + 1*32 + 16)
- add $2*32, SRND
- cmp $4*4*32, SRND
- jb .Lloop3
-
- mov _CTX(%rsp), CTX
- mov _INP(%rsp), INP
- add $64, INP
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- cmp _INP_END(%rsp), INP
- jb .Lloop0
- ja .Ldone_hash
-
-.Ldo_last_block:
- VMOVDQ 0*16(INP),XWORD0
- VMOVDQ 1*16(INP),XWORD1
- VMOVDQ 2*16(INP),XWORD2
- VMOVDQ 3*16(INP),XWORD3
-
- vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
- vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
- vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
- vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
-
- jmp .Llast_block_enter
-
-.Lonly_one_block:
-
- ## load initial digest
- mov (4*0)(CTX),a
- mov (4*1)(CTX),b
- mov (4*2)(CTX),c
- mov (4*3)(CTX),d
- mov (4*4)(CTX),e
- mov (4*5)(CTX),f
- mov (4*6)(CTX),g
- mov (4*7)(CTX),h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-
- mov CTX, _CTX(%rsp)
- jmp .Ldo_last_block
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- pop %rbp
-
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- vzeroupper
- RET
-SYM_FUNC_END(sha256_transform_rorx)
-
-.section .rodata.cst512.K256, "aM", @progbits, 512
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
-.align 32
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
-
-# shuffle xBxA -> 00BA
-.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
-.align 32
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-# shuffle xDxC -> DC00
-.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
-.align 32
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
+++ /dev/null
-########################################################################
-# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-## assume buffers not aligned
-#define MOVDQ movdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-################################
-
-# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
-# Load xmm with mem and byte swap each dword
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- MOVDQ \p2, \p1
- pshufb \p3, \p1
-.endm
-
-################################
-
-X0 = %xmm4
-X1 = %xmm5
-X2 = %xmm6
-X3 = %xmm7
-
-XTMP0 = %xmm0
-XTMP1 = %xmm1
-XTMP2 = %xmm2
-XTMP3 = %xmm3
-XTMP4 = %xmm8
-XFER = %xmm9
-
-SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %xmm12
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-
-SRND = %rsi # clobbers INP
-c = %ecx
-d = %r8d
-e = %edx
-TBL = %r12
-a = %eax
-b = %ebx
-
-f = %r9d
-g = %r10d
-h = %r11d
-
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_XFER_SIZE = 16
-_XMM_SAVE_SIZE = 0
-
-_INP_END = 0
-_INP = _INP_END + _INP_END_SIZE
-_XFER = _INP + _INP_SIZE
-_XMM_SAVE = _XFER + _XFER_SIZE
-STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- ## compute s0 four at a time and s1 two at a time
- ## compute W[-16] + W[-7] 4 at a time
- movdqa X3, XTMP0
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- palignr $4, X2, XTMP0 # XTMP0 = W[-7]
- ror $(22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- movdqa X1, XTMP1
- xor a, y1 # y1 = a ^ (a >> (22-13))
- xor g, y2 # y2 = f^g
- paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- ## compute s0
- palignr $4, X0, XTMP1 # XTMP1 = W[-15]
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH
- movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pslld $(32-7), XTMP1 #
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- psrld $7, XTMP2 #
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
- or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- #
- ROTATE_ARGS #
- movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
- ror $(25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(22-13), y1 # y1 = a >> (22-13)
- pslld $(32-18), XTMP3 #
- xor a, y1 # y1 = a ^ (a >> (22-13))
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- psrld $18, XTMP2 #
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- pxor XTMP3, XTMP1
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
- add y0, y2 # y2 = S1 + CH
- add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pxor XTMP4, XTMP1 # XTMP1 = s0
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- ## compute low s1
- pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
-
- ROTATE_ARGS
- movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- ror $(25-11), y0 # y0 = e >> (25-11)
- movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
- xor e, y0 # y0 = e ^ (e >> (25-11))
- ror $(22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13))
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
- xor g, y2 # y2 = f^g
- psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- pxor XTMP3, XTMP2
- add y0, y2 # y2 = S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- ## compute high s1
- pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
- or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- #
- ROTATE_ARGS #
- movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
- ror $(22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
- xor a, y1 # y1 = a ^ (a >> (22-13))
- xor g, y2 # y2 = f^g
- psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- pxor XTMP3, XTMP2 #
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- pxor XTMP2, X0 # X0 = s1 {xDxC}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
-
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-## input is [rsp + _XFER + %1 * 4]
-.macro DO_ROUND round
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- xor e, y0 # y0 = e ^ (e >> (25-11))
- ror $(22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13))
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- and e, y2 # y2 = (f^g)&e
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- add y0, y2 # y2 = S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- offset = \round * 4 + _XFER
- add offset(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
-.endm
-
-########################################################################
-## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
-## int blocks);
-## arg 1 : pointer to state
-## (struct sha256_state is assumed to begin with u32 state[8])
-## arg 2 : pointer to input data
-## arg 3 : Num blocks
-########################################################################
-.text
-SYM_TYPED_FUNC_START(sha256_transform_ssse3)
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbp
- mov %rsp, %rbp
-
- subq $STACK_SIZE, %rsp
- and $~15, %rsp
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS
- mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
-
- ## load initial digest
- mov 4*0(CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- movdqa _SHUF_00BA(%rip), SHUF_00BA
- movdqa _SHUF_DC00(%rip), SHUF_DC00
-
-.Lloop0:
- lea K256(%rip), TBL
-
- ## byte swap first 16 dwords
- COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
-
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 16 each
- mov $3, SRND
-.align 16
-.Lloop1:
- movdqa (TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 1*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 2*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 3*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- add $4*16, TBL
- FOUR_ROUNDS_AND_SCHED
-
- sub $1, SRND
- jne .Lloop1
-
- mov $2, SRND
-.Lloop2:
- paddd (TBL), X0
- movdqa X0, _XFER(%rsp)
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
- paddd 1*16(TBL), X1
- movdqa X1, _XFER(%rsp)
- add $2*16, TBL
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- movdqa X2, X0
- movdqa X3, X1
-
- sub $1, SRND
- jne .Lloop2
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- mov _INP(%rsp), INP
- add $64, INP
- cmp _INP_END(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- popq %rbp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
-
- RET
-SYM_FUNC_END(sha256_transform_ssse3)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
-
-.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
-.align 16
-# shuffle xBxA -> 00BA
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
-.align 16
-# shuffle xDxC -> DC00
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
+++ /dev/null
-/*
- * Intel SHA Extensions optimized implementation of a SHA-256 update function
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Sean Gulley <sean.m.gulley@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-#define DIGEST_PTR %rdi /* 1st arg */
-#define DATA_PTR %rsi /* 2nd arg */
-#define NUM_BLKS %rdx /* 3rd arg */
-
-#define SHA256CONSTANTS %rax
-
-#define MSG %xmm0 /* sha256rnds2 implicit operand */
-#define STATE0 %xmm1
-#define STATE1 %xmm2
-#define MSG0 %xmm3
-#define MSG1 %xmm4
-#define MSG2 %xmm5
-#define MSG3 %xmm6
-#define TMP %xmm7
-
-#define SHUF_MASK %xmm8
-
-#define ABEF_SAVE %xmm9
-#define CDGH_SAVE %xmm10
-
-.macro do_4rounds i, m0, m1, m2, m3
-.if \i < 16
- movdqu \i*4(DATA_PTR), \m0
- pshufb SHUF_MASK, \m0
-.endif
- movdqa (\i-32)*4(SHA256CONSTANTS), MSG
- paddd \m0, MSG
- sha256rnds2 STATE0, STATE1
-.if \i >= 12 && \i < 60
- movdqa \m0, TMP
- palignr $4, \m3, TMP
- paddd TMP, \m1
- sha256msg2 \m0, \m1
-.endif
- punpckhqdq MSG, MSG
- sha256rnds2 STATE1, STATE0
-.if \i >= 4 && \i < 52
- sha256msg1 \m0, \m3
-.endif
-.endm
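-
-/*
- * Round-constant addressing above, for reference: SHA256CONSTANTS is
- * preloaded with K256+32*4 (see below), so "(\i-32)*4(SHA256CONSTANTS)"
- * resolves to K256+\i*4, i.e. constants K[i..i+3]; e.g. \i = 16 addresses
- * K256+64. The -32*4 bias keeps the displacement within a signed 8-bit
- * immediate for all rounds (i = 0..60 gives -128..112).
- */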
-
-/*
- * Intel SHA Extensions optimized implementation of a SHA-256 update function
- *
- * The function takes a pointer to the current hash values, a pointer to the
- * input data, and a number of 64-byte blocks to process. Once all blocks have
- * been processed, the digest pointer is updated with the resulting hash value.
- * The function only processes complete blocks; there is no functionality to
- * store partial blocks. All message padding and hash value initialization must
- * be done outside the update function.
- *
- * void sha256_ni_transform(uint32_t *digest, const void *data,
- *		uint32_t numBlocks);
- * digest : pointer to digest
- * data: pointer to input data
- * numBlocks: Number of blocks to process
- */
-
-.text
-SYM_TYPED_FUNC_START(sha256_ni_transform)
-
- shl $6, NUM_BLKS /* convert to bytes */
- jz .Ldone_hash
- add DATA_PTR, NUM_BLKS /* pointer to end of data */
-
- /*
- * load initial hash values
- * Need to reorder these appropriately
- * DCBA, HGFE -> ABEF, CDGH
- */
- movdqu 0*16(DIGEST_PTR), STATE0 /* DCBA */
- movdqu 1*16(DIGEST_PTR), STATE1 /* HGFE */
-
- movdqa STATE0, TMP
- punpcklqdq STATE1, STATE0 /* FEBA */
- punpckhqdq TMP, STATE1 /* DCHG */
- pshufd $0x1B, STATE0, STATE0 /* ABEF */
- pshufd $0xB1, STATE1, STATE1 /* CDGH */
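-
-	/*
-	 * Lane layout now (high dword to low), as sha256rnds2 expects:
-	 * STATE0 = {A, B, E, F}, STATE1 = {C, D, G, H}.
-	 */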
-
- movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
- lea K256+32*4(%rip), SHA256CONSTANTS
-
-.Lloop0:
- /* Save hash values for addition after rounds */
- movdqa STATE0, ABEF_SAVE
- movdqa STATE1, CDGH_SAVE
-
-.irp i, 0, 16, 32, 48
- do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3
- do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0
- do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1
- do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2
-.endr
-
- /* Add the previously saved hash values to the current ones */
- paddd ABEF_SAVE, STATE0
- paddd CDGH_SAVE, STATE1
-
- /* Increment data pointer and loop if more to process */
- add $64, DATA_PTR
- cmp NUM_BLKS, DATA_PTR
- jne .Lloop0
-
- /* Write hash values back in the correct order */
- movdqa STATE0, TMP
- punpcklqdq STATE1, STATE0 /* GHEF */
- punpckhqdq TMP, STATE1 /* ABCD */
- pshufd $0xB1, STATE0, STATE0 /* HGFE */
- pshufd $0x1B, STATE1, STATE1 /* DCBA */
-
- movdqu STATE1, 0*16(DIGEST_PTR)
- movdqu STATE0, 1*16(DIGEST_PTR)
-
-.Ldone_hash:
-
- RET
-SYM_FUNC_END(sha256_ni_transform)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
+++ /dev/null
-/*
- * Cryptographic API.
- *
- * Glue code for the SHA256 Secure Hash Algorithm assembler implementations
- * using SSSE3, AVX, AVX2, and SHA-NI instructions.
- *
- * This file is based on sha256_generic.c
- *
- * Copyright (C) 2013 Intel Corporation.
- *
- * Author:
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha256_transform_ssse3(struct crypto_sha256_state *state,
- const u8 *data, int blocks);
-
-static const struct x86_cpu_id module_cpu_ids[] = {
- X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
-
-static int _sha256_update(struct shash_desc *desc, const u8 *data,
- unsigned int len,
- sha256_block_fn *sha256_xform)
-{
- int remain;
-
- /*
- * Make sure struct crypto_sha256_state begins directly with the SHA256
- * 256-bit internal state, as this is what the asm functions expect.
- */
- BUILD_BUG_ON(offsetof(struct crypto_sha256_state, state) != 0);
-
- kernel_fpu_begin();
- remain = sha256_base_do_update_blocks(desc, data, len, sha256_xform);
- kernel_fpu_end();
-
- return remain;
-}
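
The BUILD_BUG_ON above is the one layout assumption the assembly relies on: the 256-bit state words must come first. As a hedged illustration (the struct below is a stand-in, not the kernel's real definition), the same invariant as a compile-time check in plain C11:

    #include <stddef.h>

    /* Stand-in for the kernel's struct crypto_sha256_state, for
     * illustration only. */
    struct sha256_state_like {
            unsigned int state[8];          /* 256-bit working state, first */
            unsigned long long count;
            unsigned char buf[64];
    };

    /* The asm transforms treat their first argument as u32 state[8],
     * so the state words must sit at offset 0. */
    _Static_assert(offsetof(struct sha256_state_like, state) == 0,
                   "asm expects the state words at offset 0");
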
-
-static int sha256_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out, sha256_block_fn *sha256_xform)
-{
- kernel_fpu_begin();
- sha256_base_do_finup(desc, data, len, sha256_xform);
- kernel_fpu_end();
-
- return sha256_base_finish(desc, out);
-}
-
-static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return _sha256_update(desc, data, len, sha256_transform_ssse3);
-}
-
-static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_finup(desc, data, len, out, sha256_transform_ssse3);
-}
-
-static int sha256_ssse3_digest(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_base_init(desc) ?:
- sha256_ssse3_finup(desc, data, len, out);
-}
-
-static struct shash_alg sha256_ssse3_algs[] = { {
- .digestsize = SHA256_DIGEST_SIZE,
- .init = sha256_base_init,
- .update = sha256_ssse3_update,
- .finup = sha256_ssse3_finup,
- .digest = sha256_ssse3_digest,
- .descsize = sizeof(struct crypto_sha256_state),
- .base = {
- .cra_name = "sha256",
- .cra_driver_name = "sha256-ssse3",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA224_DIGEST_SIZE,
- .init = sha224_base_init,
- .update = sha256_ssse3_update,
- .finup = sha256_ssse3_finup,
- .descsize = sizeof(struct crypto_sha256_state),
- .base = {
- .cra_name = "sha224",
- .cra_driver_name = "sha224-ssse3",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA224_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int register_sha256_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- return crypto_register_shashes(sha256_ssse3_algs,
- ARRAY_SIZE(sha256_ssse3_algs));
- return 0;
-}
-
-static void unregister_sha256_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shashes(sha256_ssse3_algs,
- ARRAY_SIZE(sha256_ssse3_algs));
-}
-
-asmlinkage void sha256_transform_avx(struct crypto_sha256_state *state,
- const u8 *data, int blocks);
-
-static int sha256_avx_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return _sha256_update(desc, data, len, sha256_transform_avx);
-}
-
-static int sha256_avx_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_finup(desc, data, len, out, sha256_transform_avx);
-}
-
-static int sha256_avx_digest(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_base_init(desc) ?:
- sha256_avx_finup(desc, data, len, out);
-}
-
-static struct shash_alg sha256_avx_algs[] = { {
- .digestsize = SHA256_DIGEST_SIZE,
- .init = sha256_base_init,
- .update = sha256_avx_update,
- .finup = sha256_avx_finup,
- .digest = sha256_avx_digest,
- .descsize = sizeof(struct crypto_sha256_state),
- .base = {
- .cra_name = "sha256",
- .cra_driver_name = "sha256-avx",
- .cra_priority = 160,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA224_DIGEST_SIZE,
- .init = sha224_base_init,
- .update = sha256_avx_update,
- .finup = sha256_avx_finup,
- .descsize = sizeof(struct crypto_sha256_state),
- .base = {
- .cra_name = "sha224",
- .cra_driver_name = "sha224-avx",
- .cra_priority = 160,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA224_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static bool avx_usable(void)
-{
- if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
- if (boot_cpu_has(X86_FEATURE_AVX))
- pr_info("AVX detected but unusable.\n");
- return false;
- }
-
- return true;
-}
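
avx_usable() asks the kernel whether the OS has enabled SSE and YMM state saving, since AVX instructions fault if XCR0 does not permit that state. A minimal userspace sketch of the same test via XGETBV (compile with -mxsave; assumes CPUID has already reported OSXSAVE support):

    #include <immintrin.h>
    #include <stdbool.h>

    /* Userspace analogue of the cpu_has_xfeatures() check above. */
    static bool ymm_enabled_by_os(void)
    {
            unsigned long long xcr0 = _xgetbv(0);  /* XFEATURE_ENABLED_MASK */
            return (xcr0 & 0x6) == 0x6;            /* SSE (bit 1) | YMM (bit 2) */
    }
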
-
-static int register_sha256_avx(void)
-{
- if (avx_usable())
- return crypto_register_shashes(sha256_avx_algs,
- ARRAY_SIZE(sha256_avx_algs));
- return 0;
-}
-
-static void unregister_sha256_avx(void)
-{
- if (avx_usable())
- crypto_unregister_shashes(sha256_avx_algs,
- ARRAY_SIZE(sha256_avx_algs));
-}
-
-asmlinkage void sha256_transform_rorx(struct crypto_sha256_state *state,
- const u8 *data, int blocks);
-
-static int sha256_avx2_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return _sha256_update(desc, data, len, sha256_transform_rorx);
-}
-
-static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_finup(desc, data, len, out, sha256_transform_rorx);
-}
-
-static int sha256_avx2_digest(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_base_init(desc) ?:
- sha256_avx2_finup(desc, data, len, out);
-}
-
-static struct shash_alg sha256_avx2_algs[] = { {
- .digestsize = SHA256_DIGEST_SIZE,
- .init = sha256_base_init,
- .update = sha256_avx2_update,
- .finup = sha256_avx2_finup,
- .digest = sha256_avx2_digest,
- .descsize = sizeof(struct crypto_sha256_state),
- .base = {
- .cra_name = "sha256",
- .cra_driver_name = "sha256-avx2",
- .cra_priority = 170,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA224_DIGEST_SIZE,
- .init = sha224_base_init,
- .update = sha256_avx2_update,
- .finup = sha256_avx2_finup,
- .descsize = sizeof(struct crypto_sha256_state),
- .base = {
- .cra_name = "sha224",
- .cra_driver_name = "sha224-avx2",
- .cra_priority = 170,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA224_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static bool avx2_usable(void)
-{
- if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_BMI2))
- return true;
-
- return false;
-}
-
-static int register_sha256_avx2(void)
-{
- if (avx2_usable())
- return crypto_register_shashes(sha256_avx2_algs,
- ARRAY_SIZE(sha256_avx2_algs));
- return 0;
-}
-
-static void unregister_sha256_avx2(void)
-{
- if (avx2_usable())
- crypto_unregister_shashes(sha256_avx2_algs,
- ARRAY_SIZE(sha256_avx2_algs));
-}
-
-asmlinkage void sha256_ni_transform(struct crypto_sha256_state *digest,
- const u8 *data, int rounds);
-
-static int sha256_ni_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return _sha256_update(desc, data, len, sha256_ni_transform);
-}
-
-static int sha256_ni_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_finup(desc, data, len, out, sha256_ni_transform);
-}
-
-static int sha256_ni_digest(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_base_init(desc) ?:
- sha256_ni_finup(desc, data, len, out);
-}
-
-static struct shash_alg sha256_ni_algs[] = { {
- .digestsize = SHA256_DIGEST_SIZE,
- .init = sha256_base_init,
- .update = sha256_ni_update,
- .finup = sha256_ni_finup,
- .digest = sha256_ni_digest,
- .descsize = sizeof(struct crypto_sha256_state),
- .base = {
- .cra_name = "sha256",
- .cra_driver_name = "sha256-ni",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA224_DIGEST_SIZE,
- .init = sha224_base_init,
- .update = sha256_ni_update,
- .finup = sha256_ni_finup,
- .descsize = sizeof(struct crypto_sha256_state),
- .base = {
- .cra_name = "sha224",
- .cra_driver_name = "sha224-ni",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA224_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int register_sha256_ni(void)
-{
- if (boot_cpu_has(X86_FEATURE_SHA_NI))
- return crypto_register_shashes(sha256_ni_algs,
- ARRAY_SIZE(sha256_ni_algs));
- return 0;
-}
-
-static void unregister_sha256_ni(void)
-{
- if (boot_cpu_has(X86_FEATURE_SHA_NI))
- crypto_unregister_shashes(sha256_ni_algs,
- ARRAY_SIZE(sha256_ni_algs));
-}
-
-static int __init sha256_ssse3_mod_init(void)
-{
- if (!x86_match_cpu(module_cpu_ids))
- return -ENODEV;
-
- if (register_sha256_ssse3())
- goto fail;
-
- if (register_sha256_avx()) {
- unregister_sha256_ssse3();
- goto fail;
- }
-
- if (register_sha256_avx2()) {
- unregister_sha256_avx();
- unregister_sha256_ssse3();
- goto fail;
- }
-
- if (register_sha256_ni()) {
- unregister_sha256_avx2();
- unregister_sha256_avx();
- unregister_sha256_ssse3();
- goto fail;
- }
-
- return 0;
-fail:
- return -ENODEV;
-}
-
-static void __exit sha256_ssse3_mod_fini(void)
-{
- unregister_sha256_ni();
- unregister_sha256_avx2();
- unregister_sha256_avx();
- unregister_sha256_ssse3();
-}
-
-module_init(sha256_ssse3_mod_init);
-module_exit(sha256_ssse3_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-
-MODULE_ALIAS_CRYPTO("sha256");
-MODULE_ALIAS_CRYPTO("sha256-ssse3");
-MODULE_ALIAS_CRYPTO("sha256-avx");
-MODULE_ALIAS_CRYPTO("sha256-avx2");
-MODULE_ALIAS_CRYPTO("sha224");
-MODULE_ALIAS_CRYPTO("sha224-ssse3");
-MODULE_ALIAS_CRYPTO("sha224-avx");
-MODULE_ALIAS_CRYPTO("sha224-avx2");
-MODULE_ALIAS_CRYPTO("sha256-ni");
-MODULE_ALIAS_CRYPTO("sha224-ni");
depends on 64BIT
default CRYPTO_LIB_POLY1305
select CRYPTO_ARCH_HAVE_LIB_POLY1305
+
+config CRYPTO_SHA256_X86_64
+ tristate
+ depends on 64BIT
+ default CRYPTO_LIB_SHA256
+ select CRYPTO_ARCH_HAVE_LIB_SHA256
+ select CRYPTO_LIB_SHA256_GENERIC
poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
targets += poly1305-x86_64-cryptogams.S
+obj-$(CONFIG_CRYPTO_SHA256_X86_64) += sha256-x86_64.o
+sha256-x86_64-y := sha256.o sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256-ni-asm.o
+
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $< > $@
--- /dev/null
+########################################################################
+# Implement fast SHA-256 with AVX1 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 block at a time, with 4 lanes per block
+########################################################################
+
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+
+.macro MY_ROR p1 p2
+ shld $(32-(\p1)), \p2, \p2
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+ VMOVDQ \p2, \p1
+ vpshufb \p3, \p1, \p1
+.endm
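
COPY_XMM_AND_BSWAP pairs an unaligned 16-byte load with a vpshufb through the byte-flip mask, turning four little-endian dwords into the big-endian words SHA-256 operates on. The scalar equivalent of one invocation, as a sketch (uses the GCC/Clang bswap builtin):

    #include <stdint.h>
    #include <string.h>

    /* Load 16 message bytes as four big-endian 32-bit words. */
    static void load_bswap4(uint32_t w[4], const uint8_t *p)
    {
            for (int i = 0; i < 4; i++) {
                    uint32_t v;
                    memcpy(&v, p + 4 * i, sizeof(v)); /* unaligned-safe load */
                    w[i] = __builtin_bswap32(v);      /* little -> big endian */
            }
    }
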
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+XTMP5 = %xmm11
+
+SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm13
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+
+SRND = %rsi # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %r12
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP = _INP_END + _INP_END_SIZE
+_XFER = _INP + _INP_SIZE
+_XMM_SAVE = _XFER + _XFER_SIZE
+STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+ ## compute s0 four at a time and s1 two at a time
+ ## compute W[-16] + W[-7] 4 at a time
+
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor	a, y1			# y1 = a ^ (a >> (22-13))
+ xor g, y2 # y2 = f^g
+ vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ## compute s0
+ vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpsrld $7, XTMP1, XTMP2
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpslld $(32-7), XTMP1, XTMP3
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7
+	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+	vpsrld	$18, XTMP1, XTMP2	# XTMP2 = W[-15] >> 18
+	xor	a, y1			# y1 = a ^ (a >> (22-13))
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ vpslld $(32-18), XTMP1, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+	vpxor	XTMP1, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] << (32-18)
+ add y0, y2 # y2 = S1 + CH
+ add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ ## compute low s1
+ vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+	xor	a, y1			# y1 = a ^ (a >> (22-13))
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+ xor g, y2 # y2 = f^g
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA}
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
+ add y0, y2 # y2 = S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ ## compute high s1
+ vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
+	xor	a, y1			# y1 = a ^ (a >> (22-13))
+ xor g, y2 # y2 = f^g
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC}
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ vpxor XTMP3, XTMP2, XTMP2
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ rotate_Xs
+.endm
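
Each FOUR_ROUNDS_AND_SCHED invocation interleaves four rounds with extending the message schedule by four words. Stripped of the vectorization, the recurrence the vpslld/vpsrld/vpsrlq/vpxor sequences implement is W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]; a scalar C sketch:

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, unsigned n)
    {
            return (x >> n) | (x << (32 - n));
    }

    /* sigma0/sigma1 from FIPS 180; these are the s0 and s1 the macro
     * assembles out of shifts and xors. */
    static inline uint32_t s0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
    static inline uint32_t s1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

    /* One schedule word; the macro computes W[i..i+3] per invocation. */
    static inline uint32_t sched_word(const uint32_t W[64], int i)
    {
            return s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
    }
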
+
+## input is [rsp + _XFER + \round * 4]
+.macro DO_ROUND round
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+	xor	a, y1			# y1 = a ^ (a >> (22-13))
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and e, y2 # y2 = (f^g)&e
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ add y0, y2 # y2 = S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	offset = \round * 4 + _XFER
+ add offset(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+.endm
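
DO_ROUND is one SHA-256 round in scalar registers; the running comments track S1, CH, S0, and MAJ as they accumulate. The same round in C, including the end-of-round variable rotation that ROTATE_ARGS performs at assembly time by renaming symbols:

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, unsigned n)
    {
            return (x >> n) | (x << (32 - n));
    }

    /* One round; kw is K[i] + W[i], precomputed into _XFER by vpaddd. */
    static void sha256_round(uint32_t st[8], uint32_t kw)
    {
            uint32_t a = st[0], b = st[1], c = st[2], d = st[3];
            uint32_t e = st[4], f = st[5], g = st[6], h = st[7];

            uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
            uint32_t CH  = (e & (f ^ g)) ^ g;        /* ((f^g)&e)^g */
            uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
            uint32_t MAJ = ((a | c) & b) | (a & c);
            uint32_t t1  = h + S1 + CH + kw;

            /* ROTATE_ARGS: every value moves down one register name. */
            st[7] = g; st[6] = f; st[5] = e; st[4] = d + t1;
            st[3] = c; st[2] = b; st[1] = a; st[0] = t1 + S0 + MAJ;
    }
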
+
+########################################################################
+## void sha256_transform_avx(u32 state[SHA256_STATE_WORDS],
+## const u8 *data, size_t nblocks);
+########################################################################
+.text
+SYM_FUNC_START(sha256_transform_avx)
+ ANNOTATE_NOENDBR # since this is called only via static_call
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ movq %rsp, %rbp
+
+ subq $STACK_SIZE, %rsp # allocate stack space
+ and $~15, %rsp # align stack pointer
+
+ shl $6, NUM_BLKS # convert to bytes
+ jz .Ldone_hash
+ add INP, NUM_BLKS # pointer to end of data
+ mov NUM_BLKS, _INP_END(%rsp)
+
+ ## load initial digest
+ mov 4*0(CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+.Lloop0:
+ lea K256(%rip), TBL
+
+ ## byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
+
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov $3, SRND
+.align 16
+.Lloop1:
+ vpaddd (TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 1*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 2*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 3*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ add $4*16, TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ sub $1, SRND
+ jne .Lloop1
+
+ mov $2, SRND
+.Lloop2:
+ vpaddd (TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vpaddd 1*16(TBL), X1, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ add $2*16, TBL
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vmovdqa X2, X0
+ vmovdqa X3, X1
+
+ sub $1, SRND
+ jne .Lloop2
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ mov _INP(%rsp), INP
+ add $64, INP
+ cmp _INP_END(%rsp), INP
+ jne .Lloop0
+
+.Ldone_hash:
+
+ mov %rbp, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ RET
+SYM_FUNC_END(sha256_transform_avx)
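
sha256_transform_avx only consumes whole 64-byte blocks and clobbers XMM register state, so every in-kernel caller brackets it with an FPU section, exactly as the glue code removed above did. A minimal caller-side sketch (kernel context):

    /* Sketch of the calling contract. */
    static void sha256_blocks_avx(u32 state[8], const u8 *data, size_t nblocks)
    {
            kernel_fpu_begin();     /* the transform clobbers XMM registers */
            sha256_transform_avx(state, data, nblocks);
            kernel_fpu_end();
    }
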
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
--- /dev/null
+########################################################################
+# Implement fast SHA-256 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 2 blocks at a time, with 4 lanes per block
+########################################################################
+
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+################################
+
+X0 = %ymm4
+X1 = %ymm5
+X2 = %ymm6
+X3 = %ymm7
+
+# XMM versions of above
+XWORD0 = %xmm4
+XWORD1 = %xmm5
+XWORD2 = %xmm6
+XWORD3 = %xmm7
+
+XTMP0 = %ymm0
+XTMP1 = %ymm1
+XTMP2 = %ymm2
+XTMP3 = %ymm3
+XTMP4 = %ymm8
+XFER = %ymm9
+XTMP5 = %ymm11
+
+SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %ymm13
+
+X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+c = %ecx
+d = %r8d
+e = %edx # clobbers NUM_BLKS
+y3 = %esi # clobbers INP
+
+SRND = CTX # SRND is same register as CTX
+
+a = %eax
+b = %ebx
+f = %r9d
+g = %r10d
+h = %r11d
+old_h = %r11d
+
+T1 = %r12d
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
+_XMM_SAVE_SIZE = 0
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_CTX_SIZE = 8
+
+_XFER = 0
+_XMM_SAVE = _XFER + _XFER_SIZE
+_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
+_INP = _INP_END + _INP_END_SIZE
+_CTX = _INP + _INP_SIZE
+STACK_SIZE = _CTX + _CTX_SIZE
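
The assignments above lay out the AVX2 transform's stack frame symbolically. Purely for illustration (the assembly only ever uses the numeric offsets), the same layout expressed as a C struct:

    #include <stdint.h>

    /* Mirror of the frame: _XFER holds K[i]+W[i] for two interleaved
     * blocks (2 x 64 rounds x 4 bytes), then the spilled pointers. */
    struct rorx_frame {
            uint32_t xfer[2 * 64];  /* _XFER:    offset 0x000, 512 bytes */
            uint64_t inp_end;       /* _INP_END: offset 0x200 */
            uint64_t inp;           /* _INP:     offset 0x208 */
            uint64_t ctx;           /* _CTX:     offset 0x210 */
    };                              /* STACK_SIZE = 0x218 */
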
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+ X_ = X0
+ X0 = X1
+ X1 = X2
+ X2 = X3
+ X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+ old_h = h
+ TMP_ = h
+ h = g
+ g = f
+ f = e
+ e = d
+ d = c
+ c = b
+ b = a
+ a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED disp
+################################### RND N + 0 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+
+ addl \disp(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+ vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+
+ and e, y2 # y2 = (f^g)&e # CH
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ vpsrld $7, XTMP1, XTMP2
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+
+ add y0, y2 # y2 = S1 + CH # --
+ vpslld $(32-7), XTMP1, XTMP3
+	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
+
+ vpsrld $18, XTMP1, XTMP2
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+
+ ROTATE_ARGS
+
+################################### RND N + 1 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ offset = \disp + 1*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+
+ vpslld $(32-18), XTMP1, XTMP1
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+
+ vpxor XTMP1, XTMP3, XTMP3
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
+ vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+
+
+ ROTATE_ARGS
+
+################################### RND N + 2 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ offset = \disp + 2*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ xor g, y2 # y2 = f^g # CH
+
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
+ and e, y2 # y2 = (f^g)&e # CH
+
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ vpxor XTMP3, XTMP2, XTMP2
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a ,T1 # T1 = (a >> 2) # S0
+ vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+ vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+
+	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1,h # h = k + w + h + S0 # --
+ add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3,h # h = t1 + S0 + MAJ # --
+
+
+ ROTATE_ARGS
+
+################################### RND N + 3 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ offset = \disp + 3*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpxor XTMP3, XTMP2, XTMP2
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ add y0, y2 # y2 = S1 + CH # --
+
+ vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+
+ vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+
+ add y1, h # h = k + w + h + S0 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ ROTATE_ARGS
+ rotate_Xs
+.endm
+
+.macro DO_4ROUNDS disp
+################################### RND N + 0 ###########################
+
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ addl \disp(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1, h # h = k + w + h + S0 # --
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 1 ###########################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*1 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 2 ##############################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*2 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 3 ###########################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*3 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ ROTATE_ARGS
+
+.endm
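
Both round macros compute MAJ as ((a|c)&b)|(a&c) rather than the textbook (a&b)^(a&c)^(b&c); the two forms are bitwise identical, and the rewrite lets a|c and a&c issue before b is available. A tiny exhaustive check of the identity:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            /* One bit position covers all 32: check all eight inputs. */
            for (uint32_t a = 0; a < 2; a++)
                    for (uint32_t b = 0; b < 2; b++)
                            for (uint32_t c = 0; c < 2; c++)
                                    assert((((a | c) & b) | (a & c)) ==
                                           ((a & b) ^ (a & c) ^ (b & c)));
            return 0;
    }
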
+
+########################################################################
+## void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS],
+## const u8 *data, size_t nblocks);
+########################################################################
+.text
+SYM_FUNC_START(sha256_transform_rorx)
+ ANNOTATE_NOENDBR # since this is called only via static_call
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ push %rbp
+ mov %rsp, %rbp
+
+ subq $STACK_SIZE, %rsp
+ and $-32, %rsp # align rsp to 32 byte boundary
+
+ shl $6, NUM_BLKS # convert to bytes
+ jz .Ldone_hash
+ lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
+ mov NUM_BLKS, _INP_END(%rsp)
+
+ cmp NUM_BLKS, INP
+ je .Lonly_one_block
+
+ ## load initial digest
+ mov (CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+
+ mov CTX, _CTX(%rsp)
+
+.Lloop0:
+ ## Load first 16 dwords from two blocks
+ VMOVDQ 0*32(INP),XTMP0
+ VMOVDQ 1*32(INP),XTMP1
+ VMOVDQ 2*32(INP),XTMP2
+ VMOVDQ 3*32(INP),XTMP3
+
+ ## byte swap data
+ vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
+ vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
+ vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
+ vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
+
+ ## transpose data into high/low halves
+ vperm2i128 $0x20, XTMP2, XTMP0, X0
+ vperm2i128 $0x31, XTMP2, XTMP0, X1
+ vperm2i128 $0x20, XTMP3, XTMP1, X2
+ vperm2i128 $0x31, XTMP3, XTMP1, X3
+
+.Llast_block_enter:
+ add $64, INP
+ mov INP, _INP(%rsp)
+
+	## schedule 48 input dwords, by doing 3 rounds of 16 each
+ xor SRND, SRND
+
+.align 16
+.Lloop1:
+ leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED (_XFER + 0*32)
+
+ leaq K256+1*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED (_XFER + 1*32)
+
+ leaq K256+2*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)
+
+ leaq K256+3*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED (_XFER + 3*32)
+
+ add $4*32, SRND
+ cmp $3*4*32, SRND
+ jb .Lloop1
+
+.Lloop2:
+ ## Do last 16 rounds with no scheduling
+ leaq K256+0*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+ DO_4ROUNDS (_XFER + 0*32)
+
+ leaq K256+1*32(%rip), INP
+ vpaddd (INP, SRND), X1, XFER
+ vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+ DO_4ROUNDS (_XFER + 1*32)
+ add $2*32, SRND
+
+ vmovdqa X2, X0
+ vmovdqa X3, X1
+
+ cmp $4*4*32, SRND
+ jb .Lloop2
+
+ mov _CTX(%rsp), CTX
+ mov _INP(%rsp), INP
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ cmp _INP_END(%rsp), INP
+ ja .Ldone_hash
+
+ #### Do second block using previously scheduled results
+ xor SRND, SRND
+.align 16
+.Lloop3:
+ DO_4ROUNDS (_XFER + 0*32 + 16)
+ DO_4ROUNDS (_XFER + 1*32 + 16)
+ add $2*32, SRND
+ cmp $4*4*32, SRND
+ jb .Lloop3
+
+ mov _CTX(%rsp), CTX
+ mov _INP(%rsp), INP
+ add $64, INP
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ cmp _INP_END(%rsp), INP
+ jb .Lloop0
+ ja .Ldone_hash
+
+.Ldo_last_block:
+ VMOVDQ 0*16(INP),XWORD0
+ VMOVDQ 1*16(INP),XWORD1
+ VMOVDQ 2*16(INP),XWORD2
+ VMOVDQ 3*16(INP),XWORD3
+
+ vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
+ vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
+ vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
+ vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
+
+ jmp .Llast_block_enter
+
+.Lonly_one_block:
+
+ ## load initial digest
+ mov (4*0)(CTX),a
+ mov (4*1)(CTX),b
+ mov (4*2)(CTX),c
+ mov (4*3)(CTX),d
+ mov (4*4)(CTX),e
+ mov (4*5)(CTX),f
+ mov (4*6)(CTX),g
+ mov (4*7)(CTX),h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+
+ mov CTX, _CTX(%rsp)
+ jmp .Ldo_last_block
+
+.Ldone_hash:
+
+ mov %rbp, %rsp
+ pop %rbp
+
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ vzeroupper
+ RET
+SYM_FUNC_END(sha256_transform_rorx)
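
Since the rorx variant schedules two blocks per pass, the prologue and the cmp/branch pairs above split the work by block count. Restated as a hedged control-flow sketch in C (process_one_block and process_two_blocks are illustrative stand-ins for the code paths, not real symbols):

    #include <stddef.h>

    extern void process_one_block(void);   /* stand-in */
    extern void process_two_blocks(void);  /* stand-in */

    /* Control-flow sketch of sha256_transform_rorx, illustration only. */
    static void rorx_flow(size_t nblocks)
    {
            if (nblocks == 0)
                    return;                  /* jz .Ldone_hash */
            if (nblocks == 1) {
                    process_one_block();     /* .Lonly_one_block */
                    return;
            }
            while (nblocks >= 2) {           /* .Lloop0: schedule two blocks; */
                    process_two_blocks();    /* .Lloop3 reuses the schedule   */
                    nblocks -= 2;
            }
            if (nblocks)
                    process_one_block();     /* odd tail: .Ldo_last_block */
    }
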
+
+.section .rodata.cst512.K256, "aM", @progbits, 512
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
+.align 32
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
+.align 32
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
--- /dev/null
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+
+#define STATE_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define SHA256CONSTANTS %rax
+
+#define MSG %xmm0 /* sha256rnds2 implicit operand */
+#define STATE0 %xmm1
+#define STATE1 %xmm2
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define TMP %xmm7
+
+#define SHUF_MASK %xmm8
+
+#define ABEF_SAVE %xmm9
+#define CDGH_SAVE %xmm10
+
+.macro do_4rounds i, m0, m1, m2, m3
+.if \i < 16
+ movdqu \i*4(DATA_PTR), \m0
+ pshufb SHUF_MASK, \m0
+.endif
+ movdqa (\i-32)*4(SHA256CONSTANTS), MSG
+ paddd \m0, MSG
+ sha256rnds2 STATE0, STATE1
+.if \i >= 12 && \i < 60
+ movdqa \m0, TMP
+ palignr $4, \m3, TMP
+ paddd TMP, \m1
+ sha256msg2 \m0, \m1
+.endif
+ punpckhqdq MSG, MSG
+ sha256rnds2 STATE1, STATE0
+.if \i >= 4 && \i < 52
+ sha256msg1 \m0, \m3
+.endif
+.endm
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 block function
+ *
+ * This function takes a pointer to the current SHA-256 state, a pointer to the
+ * input data, and the number of 64-byte blocks to process. Once all blocks
+ * have been processed, the state is updated in place. This function
+ * only processes complete blocks. State initialization, buffering of partial
+ * blocks, and digest finalization are expected to be handled elsewhere.
+ *
+ * void sha256_ni_transform(u32 state[SHA256_STATE_WORDS],
+ * const u8 *data, size_t nblocks);
+ */
+.text
+SYM_FUNC_START(sha256_ni_transform)
+ ANNOTATE_NOENDBR # since this is called only via static_call
+
+ shl $6, NUM_BLKS /* convert to bytes */
+ jz .Ldone_hash
+ add DATA_PTR, NUM_BLKS /* pointer to end of data */
+
+ /*
+ * load initial hash values
+ * Need to reorder these appropriately
+ * DCBA, HGFE -> ABEF, CDGH
+ */
+ movdqu 0*16(STATE_PTR), STATE0 /* DCBA */
+ movdqu 1*16(STATE_PTR), STATE1 /* HGFE */
+
+ movdqa STATE0, TMP
+ punpcklqdq STATE1, STATE0 /* FEBA */
+ punpckhqdq TMP, STATE1 /* DCHG */
+ pshufd $0x1B, STATE0, STATE0 /* ABEF */
+ pshufd $0xB1, STATE1, STATE1 /* CDGH */
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+ lea K256+32*4(%rip), SHA256CONSTANTS
+
+.Lloop0:
+ /* Save hash values for addition after rounds */
+ movdqa STATE0, ABEF_SAVE
+ movdqa STATE1, CDGH_SAVE
+
+.irp i, 0, 16, 32, 48
+ do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3
+ do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0
+ do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1
+ do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2
+.endr
+
+ /* Add current hash values with previously saved */
+ paddd ABEF_SAVE, STATE0
+ paddd CDGH_SAVE, STATE1
+
+ /* Increment data pointer and loop if more to process */
+ add $64, DATA_PTR
+ cmp NUM_BLKS, DATA_PTR
+ jne .Lloop0
+
+ /* Write hash values back in the correct order */
+ movdqa STATE0, TMP
+ punpcklqdq STATE1, STATE0 /* GHEF */
+ punpckhqdq TMP, STATE1 /* ABCD */
+ pshufd $0xB1, STATE0, STATE0 /* HGFE */
+ pshufd $0x1B, STATE1, STATE1 /* DCBA */
+
+ movdqu STATE1, 0*16(STATE_PTR)
+ movdqu STATE0, 1*16(STATE_PTR)
+
+.Ldone_hash:
+
+ RET
+SYM_FUNC_END(sha256_ni_transform)
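
sha256rnds2 consumes the state as two registers holding ABEF and CDGH with A and C in the top lanes, while memory holds the natural A..H word order; that is what the punpcklqdq/punpckhqdq/pshufd dance on entry and exit achieves. A scalar sketch of the load-side repacking (index 3 models the most significant xmm lane):

    #include <stdint.h>

    /* Repack state[8] = {A,B,C,D,E,F,G,H} into the ABEF/CDGH register
     * layout used by sha256rnds2. */
    static void sha_ni_pack(const uint32_t s[8],
                            uint32_t abef[4], uint32_t cdgh[4])
    {
            abef[3] = s[0]; abef[2] = s[1];  /* A B */
            abef[1] = s[4]; abef[0] = s[5];  /* E F */
            cdgh[3] = s[2]; cdgh[2] = s[3];  /* C D */
            cdgh[1] = s[6]; cdgh[0] = s[7];  /* G H */
    }
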
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
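
For reference, the do_4rounds step above maps directly onto the SHA-NI compiler intrinsics. A hedged sketch of one four-round step (requires <immintrin.h> and -msha; the message-schedule updates via _mm_sha256msg1_epu32/_mm_sha256msg2_epu32 are elided):

    #include <immintrin.h>

    /* Four rounds: on entry/exit state0 = ABEF, state1 = CDGH;
     * msg holds W[i..i+3], k holds K[i..i+3]. */
    static void four_rounds(__m128i *state0, __m128i *state1,
                            __m128i msg, __m128i k)
    {
            __m128i wk = _mm_add_epi32(msg, k);   /* paddd */
            *state1 = _mm_sha256rnds2_epu32(*state1, *state0, wk);
            wk = _mm_shuffle_epi32(wk, 0x0E);     /* punpckhqdq: high qword low */
            *state0 = _mm_sha256rnds2_epu32(*state0, *state1, wk);
    }
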
--- /dev/null
+########################################################################
+# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+
+## assume buffers not aligned
+#define MOVDQ movdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+ MOVDQ \p2, \p1
+ pshufb \p3, \p1
+.endm
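+
+# For example, "COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK"
+# expands to:
+#	movdqu	0*16(INP), X0
+#	pshufb	BYTE_FLIP_MASK, X0
+# i.e. an unaligned 16-byte load followed by a per-dword byte swap.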
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+
+SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm12
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+
+SRND = %rsi # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %r12
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP = _INP_END + _INP_END_SIZE
+_XFER = _INP + _INP_SIZE
+_XMM_SAVE = _XFER + _XFER_SIZE
+STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
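+
+# Frame layout relative to the aligned %rsp:
+#	[rsp +  0]	_INP_END: pointer to the end of the input data
+#	[rsp +  8]	_INP:     saved input pointer for the current block
+#	[rsp + 16]	_XFER:    16 bytes of K[t]+W[t] for the next 4 rounds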
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
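+
+# Only the symbol bindings rotate, not register contents: after one
+# ROTATE_ARGS, "a" names the register that held h (the word just
+# computed), so each round body can be written in terms of a..h with no
+# register-to-register moves.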
+
+.macro FOUR_ROUNDS_AND_SCHED
+ ## compute s0 four at a time and s1 two at a time
+ ## compute W[-16] + W[-7] 4 at a time
+ movdqa X3, XTMP0
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ palignr $4, X2, XTMP0 # XTMP0 = W[-7]
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ movdqa X1, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ xor g, y2 # y2 = f^g
+ paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ## compute s0
+ palignr $4, X0, XTMP1 # XTMP1 = W[-15]
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH
+ movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pslld $(32-7), XTMP1 #
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ psrld $7, XTMP2 #
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ #
+ ROTATE_ARGS #
+ movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ pslld $(32-18), XTMP3 #
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ psrld $18, XTMP2 #
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ pxor XTMP3, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
+ add y0, y2 # y2 = S1 + CH
+ add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pxor XTMP4, XTMP1 # XTMP1 = s0
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ ## compute low s1
+ pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
+ xor g, y2 # y2 = f^g
+ psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ pxor XTMP3, XTMP2
+ add y0, y2 # y2 = S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ ## compute high s1
+ pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ #
+ ROTATE_ARGS #
+ movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ xor g, y2 # y2 = f^g
+ psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ pxor XTMP3, XTMP2 #
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ pxor XTMP2, X0 # X0 = s1 {xDxC}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ rotate_Xs
+.endm
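+
+# Scalar C equivalent of the schedule step interleaved above (a sketch
+# for reference only; ror32() as in the kernel's <linux/bitops.h>):
+#
+#	u32 s0 = ror32(w[t-15], 7) ^ ror32(w[t-15], 18) ^ (w[t-15] >> 3);
+#	u32 s1 = ror32(w[t-2], 17) ^ ror32(w[t-2], 19) ^ (w[t-2] >> 10);
+#	w[t]   = w[t-16] + s0 + w[t-7] + s1;
+#
+# Each FOUR_ROUNDS_AND_SCHED produces four consecutive w[t] while
+# retiring four rounds, hiding the vector latency behind scalar work.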
+
+## input is [rsp + _XFER + \round * 4]
+.macro DO_ROUND round
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and e, y2 # y2 = (f^g)&e
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ add y0, y2 # y2 = S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ offset = \round * 4 + _XFER
+ add offset(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+.endm
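+
+# One DO_ROUND in C form (a reference sketch, not kernel API; Ch and
+# Maj written as the code actually computes them):
+#
+#	u32 S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
+#	u32 ch  = ((f ^ g) & e) ^ g;		/* == (e&f) ^ (~e&g) */
+#	u32 S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
+#	u32 maj = ((a | c) & b) | (a & c);	/* == (a&b)^(a&c)^(b&c) */
+#	u32 t1  = h + S1 + ch + k[t] + w[t];
+#	h = g; g = f; f = e; e = d + t1;
+#	d = c; c = b; b = a; a = t1 + S0 + maj;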
+
+########################################################################
+## void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS],
+## const u8 *data, size_t nblocks);
+########################################################################
+.text
+SYM_FUNC_START(sha256_transform_ssse3)
+ ANNOTATE_NOENDBR # since this is called only via static_call
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ mov %rsp, %rbp
+
+ subq $STACK_SIZE, %rsp
+ and $~15, %rsp
+
+ shl $6, NUM_BLKS # convert to bytes
+ jz .Ldone_hash
+ add INP, NUM_BLKS
+ mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
+
+ ## load initial digest
+ mov 4*0(CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ movdqa _SHUF_00BA(%rip), SHUF_00BA
+ movdqa _SHUF_DC00(%rip), SHUF_DC00
+
+.Lloop0:
+ lea K256(%rip), TBL
+
+ ## byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
+
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov $3, SRND
+.align 16
+.Lloop1:
+ movdqa (TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 1*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 2*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 3*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ add $4*16, TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ sub $1, SRND
+ jne .Lloop1
+
+ mov $2, SRND
+.Lloop2:
+ paddd (TBL), X0
+ movdqa X0, _XFER(%rsp)
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+ paddd 1*16(TBL), X1
+ movdqa X1, _XFER(%rsp)
+ add $2*16, TBL
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ movdqa X2, X0
+ movdqa X3, X1
+
+ sub $1, SRND
+ jne .Lloop2
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ mov _INP(%rsp), INP
+ add $64, INP
+ cmp _INP_END(%rsp), INP
+ jne .Lloop0
+
+.Ldone_hash:
+
+ mov %rbp, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+
+ RET
+SYM_FUNC_END(sha256_transform_ssse3)
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 optimized for x86_64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <crypto/internal/sha2.h>
+#include <crypto/internal/simd.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/static_call.h>
+
+asmlinkage void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS],
+ const u8 *data, size_t nblocks);
+asmlinkage void sha256_transform_avx(u32 state[SHA256_STATE_WORDS],
+ const u8 *data, size_t nblocks);
+asmlinkage void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS],
+ const u8 *data, size_t nblocks);
+asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS],
+ const u8 *data, size_t nblocks);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86);
+
+DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3);
+
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_sha256_x86) && crypto_simd_usable()) {
+ kernel_fpu_begin();
+ static_call(sha256_blocks_x86)(state, data, nblocks);
+ kernel_fpu_end();
+ } else {
+ sha256_blocks_generic(state, data, nblocks);
+ }
+}
+EXPORT_SYMBOL(sha256_blocks_arch);
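+
+/*
+ * Example use (a sketch; real callers go through the generic
+ * lib/crypto sha256 code, which handles partial-block buffering):
+ *
+ *	u32 state[SHA256_STATE_WORDS] = {
+ *		SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
+ *		SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+ *	};
+ *	sha256_blocks_arch(state, data, len / SHA256_BLOCK_SIZE);
+ */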
+
+bool sha256_is_arch_optimized(void)
+{
+ return static_key_enabled(&have_sha256_x86);
+}
+EXPORT_SYMBOL(sha256_is_arch_optimized);
+
+static int __init sha256_x86_mod_init(void)
+{
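+	/*
+	 * Select the best implementation the CPU supports: SHA-NI first,
+	 * then AVX2+BMI2 (the rorx version), then AVX, then the SSSE3
+	 * static_call default. The AVX paths also require OS support for
+	 * saving the SSE and YMM xfeature state.
+	 */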
+ if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+ static_call_update(sha256_blocks_x86, sha256_ni_transform);
+ } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE |
+ XFEATURE_MASK_YMM, NULL) &&
+ boot_cpu_has(X86_FEATURE_AVX)) {
+ if (boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ static_call_update(sha256_blocks_x86,
+ sha256_transform_rorx);
+ else
+ static_call_update(sha256_blocks_x86,
+ sha256_transform_avx);
+ } else if (!boot_cpu_has(X86_FEATURE_SSSE3)) {
+ return 0;
+ }
+ static_branch_enable(&have_sha256_x86);
+ return 0;
+}
+arch_initcall(sha256_x86_mod_init);
+
+static void __exit sha256_x86_mod_exit(void)
+{
+}
+module_exit(sha256_x86_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 optimized for x86_64");