- AVX2 (Advanced Vector Extensions 2)
- SHA-NI (SHA Extensions New Instructions)
-config CRYPTO_SHA512_SSSE3
- tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)"
- depends on 64BIT
- select CRYPTO_SHA512
- select CRYPTO_HASH
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX (Advanced Vector Extensions)
- - AVX2 (Advanced Vector Extensions 2)
-
config CRYPTO_SM3_AVX_X86_64
tristate "Hash functions: SM3 (AVX)"
depends on 64BIT
obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o
-obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
-sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
-
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
+++ /dev/null
-########################################################################
-# Implement fast SHA-512 with AVX instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# David Cote <david.m.cote@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-512 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-.text
-
-# Virtual Registers
-# ARG1
-digest = %rdi
-# ARG2
-msg = %rsi
-# ARG3
-msglen = %rdx
-T1 = %rcx
-T2 = %r8
-a_64 = %r9
-b_64 = %r10
-c_64 = %r11
-d_64 = %r12
-e_64 = %r13
-f_64 = %r14
-g_64 = %r15
-h_64 = %rbx
-tmp0 = %rax
-
-# Local variables (stack frame)
-
-# Message Schedule
-W_SIZE = 80*8
-# W[t] + K[t] | W[t+1] + K[t+1]
-WK_SIZE = 2*8
-
-frame_W = 0
-frame_WK = frame_W + W_SIZE
-frame_size = frame_WK + WK_SIZE
-
-# Useful QWORD "arrays" for simpler memory references
-# MSG, DIGEST, K_t, W_t are arrays
-# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
-
-# Input message (arg1)
-#define MSG(i) 8*i(msg)
-
-# Output Digest (arg2)
-#define DIGEST(i) 8*i(digest)
-
-# SHA Constants (static mem)
-#define K_t(i) 8*i+K512(%rip)
-
-# Message Schedule (stack frame)
-#define W_t(i) 8*i+frame_W(%rsp)
-
-# W[t]+K[t] (stack frame)
-#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
-
-.macro RotateState
- # Rotate symbols a..h right
- TMP = h_64
- h_64 = g_64
- g_64 = f_64
- f_64 = e_64
- e_64 = d_64
- d_64 = c_64
- c_64 = b_64
- b_64 = a_64
- a_64 = TMP
-.endm
-
-.macro RORQ p1 p2
- # shld is faster than ror on Sandybridge
- shld $(64-\p2), \p1, \p1
-.endm
-
-.macro SHA512_Round rnd
- # Compute Round %%t
- mov f_64, T1 # T1 = f
- mov e_64, tmp0 # tmp = e
- xor g_64, T1 # T1 = f ^ g
- RORQ tmp0, 23 # 41 # tmp = e ror 23
- and e_64, T1 # T1 = (f ^ g) & e
- xor e_64, tmp0 # tmp = (e ror 23) ^ e
- xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
- idx = \rnd
- add WK_2(idx), T1 # W[t] + K[t] from message scheduler
- RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4
- xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
- mov a_64, T2 # T2 = a
- add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
- RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
- add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
- mov a_64, tmp0 # tmp = a
- xor c_64, T2 # T2 = a ^ c
- and c_64, tmp0 # tmp = a & c
- and b_64, T2 # T2 = (a ^ c) & b
- xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
- mov a_64, tmp0 # tmp = a
- RORQ tmp0, 5 # 39 # tmp = a ror 5
- xor a_64, tmp0 # tmp = (a ror 5) ^ a
- add T1, d_64 # e(next_state) = d + T1
- RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6
- xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
- lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
- RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
- add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
- RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_avx rnd
- # Compute rounds t-2 and t-1
- # Compute message schedule QWORDS t and t+1
-
- # Two rounds are computed based on the values for K[t-2]+W[t-2] and
- # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
- # scheduler.
- # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
- # They are then added to their respective SHA512 constants at
- # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
- # For brievity, the comments following vectored instructions only refer to
- # the first of a pair of QWORDS.
- # Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
- # The computation of the message schedule and the rounds are tightly
- # stitched to take advantage of instruction-level parallelism.
-
- idx = \rnd - 2
- vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2]
- idx = \rnd - 15
- vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
- mov f_64, T1
- vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61
- mov e_64, tmp0
- vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1
- xor g_64, T1
- RORQ tmp0, 23 # 41
- vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19
- and e_64, T1
- xor e_64, tmp0
- vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19
- xor g_64, T1
- idx = \rnd
- add WK_2(idx), T1#
- vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8
- RORQ tmp0, 4 # 18
- vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6
- xor e_64, tmp0
- mov a_64, T2
- add h_64, T1
- vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8
- RORQ tmp0, 14 # 14
- add tmp0, T1
- vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7
- mov a_64, tmp0
- xor c_64, T2
- vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3
- and c_64, tmp0
- and b_64, T2
- vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3
- xor tmp0, T2
- mov a_64, tmp0
- vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63
- RORQ tmp0, 5 # 39
- vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63
- xor a_64, tmp0
- add T1, d_64
- RORQ tmp0, 6 # 34
- xor a_64, tmp0
- vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
- # W[t-15]>>7 ^ W[t-15]<<63
- lea (T1, T2), h_64
- RORQ tmp0, 28 # 28
- vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25
- add tmp0, h_64
- RotateState
- vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
- # W[t-2]<<25
- mov f_64, T1
- vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2])
- mov e_64, tmp0
- xor g_64, T1
- idx = \rnd - 16
- vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16]
- idx = \rnd - 7
- vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
- RORQ tmp0, 23 # 41
- and e_64, T1
- xor e_64, tmp0
- xor g_64, T1
- vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56
- idx = \rnd + 1
- add WK_2(idx), T1
- vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15])
- RORQ tmp0, 4 # 18
- vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
- xor e_64, tmp0
- vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
- # s0(W[t-15]) + W[t-16]
- mov a_64, T2
- add h_64, T1
- RORQ tmp0, 14 # 14
- add tmp0, T1
- idx = \rnd
- vmovdqa %xmm0, W_t(idx) # Store W[t]
- vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t]
- vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
- mov a_64, tmp0
- xor c_64, T2
- and c_64, tmp0
- and b_64, T2
- xor tmp0, T2
- mov a_64, tmp0
- RORQ tmp0, 5 # 39
- xor a_64, tmp0
- add T1, d_64
- RORQ tmp0, 6 # 34
- xor a_64, tmp0
- lea (T1, T2), h_64
- RORQ tmp0, 28 # 28
- add tmp0, h_64
- RotateState
-.endm
-
-########################################################################
-# void sha512_transform_avx(sha512_state *state, const u8 *data, int blocks)
-# Purpose: Updates the SHA512 digest stored at "state" with the message
-# stored in "data".
-# The size of the message pointed to by "data" must be an integer multiple
-# of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks
-########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_avx)
- test msglen, msglen
- je .Lnowork
-
- # Save GPRs
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- # Allocate Stack Space
- push %rbp
- mov %rsp, %rbp
- sub $frame_size, %rsp
- and $~(0x20 - 1), %rsp
-
-.Lupdateblock:
-
- # Load state variables
- mov DIGEST(0), a_64
- mov DIGEST(1), b_64
- mov DIGEST(2), c_64
- mov DIGEST(3), d_64
- mov DIGEST(4), e_64
- mov DIGEST(5), f_64
- mov DIGEST(6), g_64
- mov DIGEST(7), h_64
-
- t = 0
- .rept 80/2 + 1
- # (80 rounds) / (2 rounds/iteration) + (1 iteration)
- # +1 iteration because the scheduler leads hashing by 1 iteration
- .if t < 2
- # BSWAP 2 QWORDS
- vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1
- vmovdqu MSG(t), %xmm0
- vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
- vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
- vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
- vmovdqa %xmm0, WK_2(t) # Store into WK for rounds
- .elseif t < 16
- # BSWAP 2 QWORDS# Compute 2 Rounds
- vmovdqu MSG(t), %xmm0
- vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
- SHA512_Round t-2 # Round t-2
- vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
- vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
- SHA512_Round t-1 # Round t-1
- vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK
- .elseif t < 79
- # Schedule 2 QWORDS# Compute 2 Rounds
- SHA512_2Sched_2Round_avx t
- .else
- # Compute 2 Rounds
- SHA512_Round t-2
- SHA512_Round t-1
- .endif
- t = t+2
- .endr
-
- # Update digest
- add a_64, DIGEST(0)
- add b_64, DIGEST(1)
- add c_64, DIGEST(2)
- add d_64, DIGEST(3)
- add e_64, DIGEST(4)
- add f_64, DIGEST(5)
- add g_64, DIGEST(6)
- add h_64, DIGEST(7)
-
- # Advance to next message block
- add $16*8, msg
- dec msglen
- jnz .Lupdateblock
-
- # Restore Stack Pointer
- mov %rbp, %rsp
- pop %rbp
-
- # Restore GPRs
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
-.Lnowork:
- RET
-SYM_FUNC_END(sha512_transform_avx)
-
-########################################################################
-### Binary Data
-
-.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
-.align 16
-# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
-XMM_QWORD_BSWAP:
- .octa 0x08090a0b0c0d0e0f0001020304050607
-
-# Mergeable 640-byte rodata section. This allows linker to merge the table
-# with other, exactly the same 640-byte fragment of another rodata section
-# (if such section exists).
-.section .rodata.cst640.K512, "aM", @progbits, 640
-.align 64
-# K[t] used in SHA512 hashing
-K512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+++ /dev/null
-########################################################################
-# Implement fast SHA-512 with AVX2 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# David Cote <david.m.cote@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-512 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 1 blocks at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-.text
-
-# Virtual Registers
-Y_0 = %ymm4
-Y_1 = %ymm5
-Y_2 = %ymm6
-Y_3 = %ymm7
-
-YTMP0 = %ymm0
-YTMP1 = %ymm1
-YTMP2 = %ymm2
-YTMP3 = %ymm3
-YTMP4 = %ymm8
-XFER = YTMP0
-
-BYTE_FLIP_MASK = %ymm9
-
-# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
-CTX1 = %rdi
-CTX2 = %r12
-# 2nd arg
-INP = %rsi
-# 3rd arg
-NUM_BLKS = %rdx
-
-c = %rcx
-d = %r8
-e = %rdx
-y3 = %rsi
-
-TBL = %rdi # clobbers CTX1
-
-a = %rax
-b = %rbx
-
-f = %r9
-g = %r10
-h = %r11
-old_h = %r11
-
-T1 = %r12 # clobbers CTX2
-y0 = %r13
-y1 = %r14
-y2 = %r15
-
-# Local variables (stack frame)
-XFER_SIZE = 4*8
-SRND_SIZE = 1*8
-INP_SIZE = 1*8
-INPEND_SIZE = 1*8
-CTX_SIZE = 1*8
-
-frame_XFER = 0
-frame_SRND = frame_XFER + XFER_SIZE
-frame_INP = frame_SRND + SRND_SIZE
-frame_INPEND = frame_INP + INP_SIZE
-frame_CTX = frame_INPEND + INPEND_SIZE
-frame_size = frame_CTX + CTX_SIZE
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-
-# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
-# Load ymm with mem and byte swap each dword
-.macro COPY_YMM_AND_BSWAP p1 p2 p3
- VMOVDQ \p2, \p1
- vpshufb \p3, \p1, \p1
-.endm
-# rotate_Ys
-# Rotate values of symbols Y0...Y3
-.macro rotate_Ys
- Y_ = Y_0
- Y_0 = Y_1
- Y_1 = Y_2
- Y_2 = Y_3
- Y_3 = Y_
-.endm
-
-# RotateState
-.macro RotateState
- # Rotate symbols a..h right
- old_h = h
- TMP_ = h
- h = g
- g = f
- f = e
- e = d
- d = c
- c = b
- b = a
- a = TMP_
-.endm
-
-# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
-# YDST = {YSRC1, YSRC2} >> RVAL*8
-.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
- vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI}
- vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
-################################### RND N + 0 #########################################
-
- # Extract w[t-7]
- MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
- # Calculate w[t-16] + w[t-7]
- vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
- # Extract w[t-15]
- MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
-
- # Calculate sigma0
-
- # Calculate w[t-15] ror 1
- vpsrlq $1, YTMP1, YTMP2
- vpsllq $(64-1), YTMP1, YTMP3
- vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
- # Calculate w[t-15] shr 7
- vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- add frame_XFER(%rsp),h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
- mov f, y2 # y2 = f # CH
- rorx $34, a, T1 # T1 = a >> 34 # S0B
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- xor g, y2 # y2 = f^g # CH
- rorx $14, e, y1 # y1 = (e >> 14) # S1
-
- and e, y2 # y2 = (f^g)&e # CH
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- add h, d # d = k + w + h + d # --
-
- and b, y3 # y3 = (a|c)&b # MAJA
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
-
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
-
- add y0, y2 # y2 = S1 + CH # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-################################### RND N + 1 #########################################
-
- # Calculate w[t-15] ror 8
- vpsrlq $8, YTMP1, YTMP2
- vpsllq $(64-8), YTMP1, YTMP1
- vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
- # XOR the three components
- vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
- vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0
-
-
- # Add three components, w[t-16], w[t-7] and sigma0
- vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
- # Move to appropriate lanes for calculating w[16] and w[17]
- vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
- # Move to appropriate lanes for calculating w[18] and w[19]
- vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
-
- # Calculate w[16] and w[17] in both 128 bit lanes
-
- # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
- vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
- vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
-
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- mov f, y2 # y2 = f # CH
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
-
- and b, y3 # y3 = (a|c)&b # MAJA
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
-
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-
-################################### RND N + 2 #########################################
-
- vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
- vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
- vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
- vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
- # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
-
- # Add sigma1 to the other compunents to get w[16] and w[17]
- vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
-
- # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
- vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
-
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- or c, y3 # y3 = a|c # MAJA
- mov f, y2 # y2 = f # CH
- xor g, y2 # y2 = f^g # CH
-
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- and e, y2 # y2 = (f^g)&e # CH
-
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-################################### RND N + 3 #########################################
-
- vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
- vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
- vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
- vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
- # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
-
- # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
- # to newly calculated sigma1 to get w[18] and w[19]
- vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
-
- # Form w[19, w[18], w17], w[16]
- vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- mov f, y2 # y2 = f # CH
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- add y0, y2 # y2 = S1 + CH # --
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- rorx $28, a, T1 # T1 = (a >> 28) # S0
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
-
- add y1, h # h = k + w + h + S0 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
- rotate_Ys
-.endm
-
-.macro DO_4ROUNDS
-
-################################### RND N + 0 #########################################
-
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- RotateState
-
-################################### RND N + 1 #########################################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- RotateState
-
-################################### RND N + 2 #########################################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- RotateState
-
-################################### RND N + 3 #########################################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-.endm
-
-########################################################################
-# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
-# Purpose: Updates the SHA512 digest stored at "state" with the message
-# stored in "data".
-# The size of the message pointed to by "data" must be an integer multiple
-# of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks
-########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_rorx)
- # Save GPRs
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- # Allocate Stack Space
- push %rbp
- mov %rsp, %rbp
- sub $frame_size, %rsp
- and $~(0x20 - 1), %rsp
-
- shl $7, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS # pointer to end of data
- mov NUM_BLKS, frame_INPEND(%rsp)
-
- ## load initial digest
- mov 8*0(CTX1), a
- mov 8*1(CTX1), b
- mov 8*2(CTX1), c
- mov 8*3(CTX1), d
- mov 8*4(CTX1), e
- mov 8*5(CTX1), f
- mov 8*6(CTX1), g
- mov 8*7(CTX1), h
-
- # save %rdi (CTX) before it gets clobbered
- mov %rdi, frame_CTX(%rsp)
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
-
-.Lloop0:
- lea K512(%rip), TBL
-
- ## byte swap first 16 dwords
- COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK
-
- mov INP, frame_INP(%rsp)
-
- ## schedule 64 input dwords, by doing 12 rounds of 4 each
- movq $4, frame_SRND(%rsp)
-
-.align 16
-.Lloop1:
- vpaddq (TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddq 1*32(TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddq 2*32(TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddq 3*32(TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- add $(4*32), TBL
- FOUR_ROUNDS_AND_SCHED
-
- subq $1, frame_SRND(%rsp)
- jne .Lloop1
-
- movq $2, frame_SRND(%rsp)
-.Lloop2:
- vpaddq (TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- DO_4ROUNDS
- vpaddq 1*32(TBL), Y_1, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- add $(2*32), TBL
- DO_4ROUNDS
-
- vmovdqa Y_2, Y_0
- vmovdqa Y_3, Y_1
-
- subq $1, frame_SRND(%rsp)
- jne .Lloop2
-
- mov frame_CTX(%rsp), CTX2
- addm 8*0(CTX2), a
- addm 8*1(CTX2), b
- addm 8*2(CTX2), c
- addm 8*3(CTX2), d
- addm 8*4(CTX2), e
- addm 8*5(CTX2), f
- addm 8*6(CTX2), g
- addm 8*7(CTX2), h
-
- mov frame_INP(%rsp), INP
- add $128, INP
- cmp frame_INPEND(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- # Restore Stack Pointer
- mov %rbp, %rsp
- pop %rbp
-
- # Restore GPRs
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
- vzeroupper
- RET
-SYM_FUNC_END(sha512_transform_rorx)
-
-########################################################################
-### Binary Data
-
-
-# Mergeable 640-byte rodata section. This allows linker to merge the table
-# with other, exactly the same 640-byte fragment of another rodata section
-# (if such section exists).
-.section .rodata.cst640.K512, "aM", @progbits, 640
-.align 64
-# K[t] used in SHA512 hashing
-K512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
-
-.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
-.align 32
-# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x08090a0b0c0d0e0f0001020304050607
- .octa 0x18191a1b1c1d1e1f1011121314151617
-
-.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
-.align 32
-MASK_YMM_LO:
- .octa 0x00000000000000000000000000000000
- .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
+++ /dev/null
-########################################################################
-# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# David Cote <david.m.cote@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-512 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-.text
-
-# Virtual Registers
-# ARG1
-digest = %rdi
-# ARG2
-msg = %rsi
-# ARG3
-msglen = %rdx
-T1 = %rcx
-T2 = %r8
-a_64 = %r9
-b_64 = %r10
-c_64 = %r11
-d_64 = %r12
-e_64 = %r13
-f_64 = %r14
-g_64 = %r15
-h_64 = %rbx
-tmp0 = %rax
-
-# Local variables (stack frame)
-
-W_SIZE = 80*8
-WK_SIZE = 2*8
-
-frame_W = 0
-frame_WK = frame_W + W_SIZE
-frame_size = frame_WK + WK_SIZE
-
-# Useful QWORD "arrays" for simpler memory references
-# MSG, DIGEST, K_t, W_t are arrays
-# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
-
-# Input message (arg1)
-#define MSG(i) 8*i(msg)
-
-# Output Digest (arg2)
-#define DIGEST(i) 8*i(digest)
-
-# SHA Constants (static mem)
-#define K_t(i) 8*i+K512(%rip)
-
-# Message Schedule (stack frame)
-#define W_t(i) 8*i+frame_W(%rsp)
-
-# W[t]+K[t] (stack frame)
-#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
-
-.macro RotateState
- # Rotate symbols a..h right
- TMP = h_64
- h_64 = g_64
- g_64 = f_64
- f_64 = e_64
- e_64 = d_64
- d_64 = c_64
- c_64 = b_64
- b_64 = a_64
- a_64 = TMP
-.endm
-
-.macro SHA512_Round rnd
-
- # Compute Round %%t
- mov f_64, T1 # T1 = f
- mov e_64, tmp0 # tmp = e
- xor g_64, T1 # T1 = f ^ g
- ror $23, tmp0 # 41 # tmp = e ror 23
- and e_64, T1 # T1 = (f ^ g) & e
- xor e_64, tmp0 # tmp = (e ror 23) ^ e
- xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
- idx = \rnd
- add WK_2(idx), T1 # W[t] + K[t] from message scheduler
- ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4
- xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
- mov a_64, T2 # T2 = a
- add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
- ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
- add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
- mov a_64, tmp0 # tmp = a
- xor c_64, T2 # T2 = a ^ c
- and c_64, tmp0 # tmp = a & c
- and b_64, T2 # T2 = (a ^ c) & b
- xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
- mov a_64, tmp0 # tmp = a
- ror $5, tmp0 # 39 # tmp = a ror 5
- xor a_64, tmp0 # tmp = (a ror 5) ^ a
- add T1, d_64 # e(next_state) = d + T1
- ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6
- xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
- lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
- ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
- add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
- RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_sse rnd
-
- # Compute rounds t-2 and t-1
- # Compute message schedule QWORDS t and t+1
-
- # Two rounds are computed based on the values for K[t-2]+W[t-2] and
- # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
- # scheduler.
- # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
- # They are then added to their respective SHA512 constants at
- # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
- # For brievity, the comments following vectored instructions only refer to
- # the first of a pair of QWORDS.
- # Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
- # The computation of the message schedule and the rounds are tightly
- # stitched to take advantage of instruction-level parallelism.
- # For clarity, integer instructions (for the rounds calculation) are indented
- # by one tab. Vectored instructions (for the message scheduler) are indented
- # by two tabs.
-
- mov f_64, T1
- idx = \rnd -2
- movdqa W_t(idx), %xmm2 # XMM2 = W[t-2]
- xor g_64, T1
- and e_64, T1
- movdqa %xmm2, %xmm0 # XMM0 = W[t-2]
- xor g_64, T1
- idx = \rnd
- add WK_2(idx), T1
- idx = \rnd - 15
- movdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
- mov e_64, tmp0
- ror $23, tmp0 # 41
- movdqa %xmm5, %xmm3 # XMM3 = W[t-15]
- xor e_64, tmp0
- ror $4, tmp0 # 18
- psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42
- xor e_64, tmp0
- ror $14, tmp0 # 14
- psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1
- add tmp0, T1
- add h_64, T1
- pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2]
- mov a_64, T2
- xor c_64, T2
- pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15]
- and b_64, T2
- mov a_64, tmp0
- psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
- and c_64, tmp0
- xor tmp0, T2
- psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
- mov a_64, tmp0
- ror $5, tmp0 # 39
- pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
- xor a_64, tmp0
- ror $6, tmp0 # 34
- pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
- xor a_64, tmp0
- ror $28, tmp0 # 28
- psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
- add tmp0, T2
- add T1, d_64
- psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
- lea (T1, T2), h_64
- RotateState
- movdqa %xmm2, %xmm1 # XMM1 = W[t-2]
- mov f_64, T1
- xor g_64, T1
- movdqa %xmm5, %xmm4 # XMM4 = W[t-15]
- and e_64, T1
- xor g_64, T1
- psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42
- idx = \rnd + 1
- add WK_2(idx), T1
- mov e_64, tmp0
- psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7
- ror $23, tmp0 # 41
- xor e_64, tmp0
- pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2]
- ror $4, tmp0 # 18
- xor e_64, tmp0
- pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15]
- ror $14, tmp0 # 14
- add tmp0, T1
- psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
- add h_64, T1
- mov a_64, T2
- psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
- xor c_64, T2
- and b_64, T2
- pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2])
- mov a_64, tmp0
- and c_64, tmp0
- idx = \rnd - 7
- movdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
- xor tmp0, T2
- pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15])
- mov a_64, tmp0
- paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15])
- ror $5, tmp0 # 39
- idx =\rnd-16
- paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
- xor a_64, tmp0
- paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
- ror $6, tmp0 # 34
- movdqa %xmm0, W_t(\rnd) # Store scheduled qwords
- xor a_64, tmp0
- paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t]
- ror $28, tmp0 # 28
- idx = \rnd
- movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
- add tmp0, T2
- add T1, d_64
- lea (T1, T2), h_64
- RotateState
-.endm
-
-########################################################################
-## void sha512_transform_ssse3(struct sha512_state *state, const u8 *data,
-## int blocks);
-# (struct sha512_state is assumed to begin with u64 state[8])
-# Purpose: Updates the SHA512 digest stored at "state" with the message
-# stored in "data".
-# The size of the message pointed to by "data" must be an integer multiple
-# of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks.
-########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_ssse3)
-
- test msglen, msglen
- je .Lnowork
-
- # Save GPRs
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- # Allocate Stack Space
- push %rbp
- mov %rsp, %rbp
- sub $frame_size, %rsp
- and $~(0x20 - 1), %rsp
-
-.Lupdateblock:
-
-# Load state variables
- mov DIGEST(0), a_64
- mov DIGEST(1), b_64
- mov DIGEST(2), c_64
- mov DIGEST(3), d_64
- mov DIGEST(4), e_64
- mov DIGEST(5), f_64
- mov DIGEST(6), g_64
- mov DIGEST(7), h_64
-
- t = 0
- .rept 80/2 + 1
- # (80 rounds) / (2 rounds/iteration) + (1 iteration)
- # +1 iteration because the scheduler leads hashing by 1 iteration
- .if t < 2
- # BSWAP 2 QWORDS
- movdqa XMM_QWORD_BSWAP(%rip), %xmm1
- movdqu MSG(t), %xmm0
- pshufb %xmm1, %xmm0 # BSWAP
- movdqa %xmm0, W_t(t) # Store Scheduled Pair
- paddq K_t(t), %xmm0 # Compute W[t]+K[t]
- movdqa %xmm0, WK_2(t) # Store into WK for rounds
- .elseif t < 16
- # BSWAP 2 QWORDS# Compute 2 Rounds
- movdqu MSG(t), %xmm0
- pshufb %xmm1, %xmm0 # BSWAP
- SHA512_Round t-2 # Round t-2
- movdqa %xmm0, W_t(t) # Store Scheduled Pair
- paddq K_t(t), %xmm0 # Compute W[t]+K[t]
- SHA512_Round t-1 # Round t-1
- movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK
- .elseif t < 79
- # Schedule 2 QWORDS# Compute 2 Rounds
- SHA512_2Sched_2Round_sse t
- .else
- # Compute 2 Rounds
- SHA512_Round t-2
- SHA512_Round t-1
- .endif
- t = t+2
- .endr
-
- # Update digest
- add a_64, DIGEST(0)
- add b_64, DIGEST(1)
- add c_64, DIGEST(2)
- add d_64, DIGEST(3)
- add e_64, DIGEST(4)
- add f_64, DIGEST(5)
- add g_64, DIGEST(6)
- add h_64, DIGEST(7)
-
- # Advance to next message block
- add $16*8, msg
- dec msglen
- jnz .Lupdateblock
-
- # Restore Stack Pointer
- mov %rbp, %rsp
- pop %rbp
-
- # Restore GPRs
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
-.Lnowork:
- RET
-SYM_FUNC_END(sha512_transform_ssse3)
-
-########################################################################
-### Binary Data
-
-.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
-.align 16
-# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
-XMM_QWORD_BSWAP:
- .octa 0x08090a0b0c0d0e0f0001020304050607
-
-# Mergeable 640-byte rodata section. This allows linker to merge the table
-# with other, exactly the same 640-byte fragment of another rodata section
-# (if such section exists).
-.section .rodata.cst640.K512, "aM", @progbits, 640
-.align 64
-# K[t] used in SHA512 hashing
-K512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+++ /dev/null
-/*
- * Cryptographic API.
- *
- * Glue code for the SHA512 Secure Hash Algorithm assembler
- * implementation using supplemental SSE3 / AVX / AVX2 instructions.
- *
- * This file is based on sha512_generic.c
- *
- * Copyright (C) 2013 Intel Corporation
- * Author: Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-
-asmlinkage void sha512_transform_ssse3(struct sha512_state *state,
- const u8 *data, int blocks);
-
-static int sha512_update_x86(struct shash_desc *desc, const u8 *data,
- unsigned int len, sha512_block_fn *sha512_xform)
-{
- int remain;
-
- /*
- * Make sure struct sha512_state begins directly with the SHA512
- * 512-bit internal state, as this is what the asm functions expect.
- */
- BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
-
- kernel_fpu_begin();
- remain = sha512_base_do_update_blocks(desc, data, len, sha512_xform);
- kernel_fpu_end();
-
- return remain;
-}
-
-static int sha512_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out, sha512_block_fn *sha512_xform)
-{
- kernel_fpu_begin();
- sha512_base_do_finup(desc, data, len, sha512_xform);
- kernel_fpu_end();
-
- return sha512_base_finish(desc, out);
-}
-
-static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_update_x86(desc, data, len, sha512_transform_ssse3);
-}
-
-static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha512_finup(desc, data, len, out, sha512_transform_ssse3);
-}
-
-static struct shash_alg sha512_ssse3_algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_ssse3_update,
- .finup = sha512_ssse3_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-ssse3",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_ssse3_update,
- .finup = sha512_ssse3_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-ssse3",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int register_sha512_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- return crypto_register_shashes(sha512_ssse3_algs,
- ARRAY_SIZE(sha512_ssse3_algs));
- return 0;
-}
-
-static void unregister_sha512_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shashes(sha512_ssse3_algs,
- ARRAY_SIZE(sha512_ssse3_algs));
-}
-
-asmlinkage void sha512_transform_avx(struct sha512_state *state,
- const u8 *data, int blocks);
-static bool avx_usable(void)
-{
- if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
- if (boot_cpu_has(X86_FEATURE_AVX))
- pr_info("AVX detected but unusable.\n");
- return false;
- }
-
- return true;
-}
-
-static int sha512_avx_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_update_x86(desc, data, len, sha512_transform_avx);
-}
-
-static int sha512_avx_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha512_finup(desc, data, len, out, sha512_transform_avx);
-}
-
-static struct shash_alg sha512_avx_algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_avx_update,
- .finup = sha512_avx_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-avx",
- .cra_priority = 160,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_avx_update,
- .finup = sha512_avx_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-avx",
- .cra_priority = 160,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int register_sha512_avx(void)
-{
- if (avx_usable())
- return crypto_register_shashes(sha512_avx_algs,
- ARRAY_SIZE(sha512_avx_algs));
- return 0;
-}
-
-static void unregister_sha512_avx(void)
-{
- if (avx_usable())
- crypto_unregister_shashes(sha512_avx_algs,
- ARRAY_SIZE(sha512_avx_algs));
-}
-
-asmlinkage void sha512_transform_rorx(struct sha512_state *state,
- const u8 *data, int blocks);
-
-static int sha512_avx2_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_update_x86(desc, data, len, sha512_transform_rorx);
-}
-
-static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha512_finup(desc, data, len, out, sha512_transform_rorx);
-}
-
-static struct shash_alg sha512_avx2_algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_avx2_update,
- .finup = sha512_avx2_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-avx2",
- .cra_priority = 170,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_avx2_update,
- .finup = sha512_avx2_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-avx2",
- .cra_priority = 170,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static bool avx2_usable(void)
-{
- if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_BMI2))
- return true;
-
- return false;
-}
-
-static int register_sha512_avx2(void)
-{
- if (avx2_usable())
- return crypto_register_shashes(sha512_avx2_algs,
- ARRAY_SIZE(sha512_avx2_algs));
- return 0;
-}
-static const struct x86_cpu_id module_cpu_ids[] = {
- X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
-
-static void unregister_sha512_avx2(void)
-{
- if (avx2_usable())
- crypto_unregister_shashes(sha512_avx2_algs,
- ARRAY_SIZE(sha512_avx2_algs));
-}
-
-static int __init sha512_ssse3_mod_init(void)
-{
- if (!x86_match_cpu(module_cpu_ids))
- return -ENODEV;
-
- if (register_sha512_ssse3())
- goto fail;
-
- if (register_sha512_avx()) {
- unregister_sha512_ssse3();
- goto fail;
- }
-
- if (register_sha512_avx2()) {
- unregister_sha512_avx();
- unregister_sha512_ssse3();
- goto fail;
- }
-
- return 0;
-fail:
- return -ENODEV;
-}
-
-static void __exit sha512_ssse3_mod_fini(void)
-{
- unregister_sha512_avx2();
- unregister_sha512_avx();
- unregister_sha512_ssse3();
-}
-
-module_init(sha512_ssse3_mod_init);
-module_exit(sha512_ssse3_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-
-MODULE_ALIAS_CRYPTO("sha512");
-MODULE_ALIAS_CRYPTO("sha512-ssse3");
-MODULE_ALIAS_CRYPTO("sha512-avx");
-MODULE_ALIAS_CRYPTO("sha512-avx2");
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha384-ssse3");
-MODULE_ALIAS_CRYPTO("sha384-avx");
-MODULE_ALIAS_CRYPTO("sha384-avx2");
default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
default y if S390
default y if SPARC64
+ default y if X86_64
config CRYPTO_LIB_SM3
tristate
libsha512-$(CONFIG_RISCV) += riscv/sha512-riscv64-zvknhb-zvkb.o
libsha512-$(CONFIG_SPARC) += sparc/sha512_asm.o
+libsha512-$(CONFIG_X86) += x86/sha512-ssse3-asm.o \
+ x86/sha512-avx-asm.o \
+ x86/sha512-avx2-asm.o
endif # CONFIG_CRYPTO_LIB_SHA512_ARCH
obj-$(CONFIG_MPILIB) += mpi/
--- /dev/null
+########################################################################
+# Implement fast SHA-512 with AVX instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+digest = %rdi
+# ARG2
+msg = %rsi
+# ARG3
+msglen = %rdx
+T1 = %rcx
+T2 = %r8
+a_64 = %r9
+b_64 = %r10
+c_64 = %r11
+d_64 = %r12
+e_64 = %r13
+f_64 = %r14
+g_64 = %r15
+h_64 = %rbx
+tmp0 = %rax
+
+# Local variables (stack frame)
+
+# Message Schedule
+W_SIZE = 80*8
+# W[t] + K[t] | W[t+1] + K[t+1]
+WK_SIZE = 2*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_size = frame_WK + WK_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i) 8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i) 8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i) 8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+ # Rotate symbols a..h right
+ TMP = h_64
+ h_64 = g_64
+ g_64 = f_64
+ f_64 = e_64
+ e_64 = d_64
+ d_64 = c_64
+ c_64 = b_64
+ b_64 = a_64
+ a_64 = TMP
+.endm
+
+.macro RORQ p1 p2
+ # shld is faster than ror on Sandybridge
+ shld $(64-\p2), \p1, \p1
+.endm
+
+.macro SHA512_Round rnd
+ # Compute Round %%t
+ mov f_64, T1 # T1 = f
+ mov e_64, tmp0 # tmp = e
+ xor g_64, T1 # T1 = f ^ g
+ RORQ tmp0, 23 # 41 # tmp = e ror 23
+ and e_64, T1 # T1 = (f ^ g) & e
+ xor e_64, tmp0 # tmp = (e ror 23) ^ e
+ xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ idx = \rnd
+ add WK_2(idx), T1 # W[t] + K[t] from message scheduler
+ RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4
+ xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov a_64, T2 # T2 = a
+ add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
+ RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov a_64, tmp0 # tmp = a
+ xor c_64, T2 # T2 = a ^ c
+ and c_64, tmp0 # tmp = a & c
+ and b_64, T2 # T2 = (a ^ c) & b
+ xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov a_64, tmp0 # tmp = a
+ RORQ tmp0, 5 # 39 # tmp = a ror 5
+ xor a_64, tmp0 # tmp = (a ror 5) ^ a
+ add T1, d_64 # e(next_state) = d + T1
+ RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6
+ xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
+ RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+ add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
+ RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_avx rnd
+ # Compute rounds t-2 and t-1
+ # Compute message schedule QWORDS t and t+1
+
+ # Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ # scheduler.
+ # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
+ # They are then added to their respective SHA512 constants at
+ # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
+ # For brievity, the comments following vectored instructions only refer to
+ # the first of a pair of QWORDS.
+ # Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
+ # The computation of the message schedule and the rounds are tightly
+ # stitched to take advantage of instruction-level parallelism.
+
+ idx = \rnd - 2
+ vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2]
+ idx = \rnd - 15
+ vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
+ mov f_64, T1
+ vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61
+ mov e_64, tmp0
+ vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1
+ xor g_64, T1
+ RORQ tmp0, 23 # 41
+ vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19
+ and e_64, T1
+ xor e_64, tmp0
+ vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19
+ xor g_64, T1
+ idx = \rnd
+ add WK_2(idx), T1#
+ vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8
+ RORQ tmp0, 4 # 18
+ vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6
+ xor e_64, tmp0
+ mov a_64, T2
+ add h_64, T1
+ vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8
+ RORQ tmp0, 14 # 14
+ add tmp0, T1
+ vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7
+ mov a_64, tmp0
+ xor c_64, T2
+ vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3
+ and c_64, tmp0
+ and b_64, T2
+ vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3
+ xor tmp0, T2
+ mov a_64, tmp0
+ vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63
+ RORQ tmp0, 5 # 39
+ vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63
+ xor a_64, tmp0
+ add T1, d_64
+ RORQ tmp0, 6 # 34
+ xor a_64, tmp0
+ vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
+ # W[t-15]>>7 ^ W[t-15]<<63
+ lea (T1, T2), h_64
+ RORQ tmp0, 28 # 28
+ vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25
+ add tmp0, h_64
+ RotateState
+ vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
+ # W[t-2]<<25
+ mov f_64, T1
+ vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2])
+ mov e_64, tmp0
+ xor g_64, T1
+ idx = \rnd - 16
+ vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16]
+ idx = \rnd - 7
+ vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
+ RORQ tmp0, 23 # 41
+ and e_64, T1
+ xor e_64, tmp0
+ xor g_64, T1
+ vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56
+ idx = \rnd + 1
+ add WK_2(idx), T1
+ vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15])
+ RORQ tmp0, 4 # 18
+ vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
+ xor e_64, tmp0
+ vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
+ # s0(W[t-15]) + W[t-16]
+ mov a_64, T2
+ add h_64, T1
+ RORQ tmp0, 14 # 14
+ add tmp0, T1
+ idx = \rnd
+ vmovdqa %xmm0, W_t(idx) # Store W[t]
+ vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t]
+ vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
+ mov a_64, tmp0
+ xor c_64, T2
+ and c_64, tmp0
+ and b_64, T2
+ xor tmp0, T2
+ mov a_64, tmp0
+ RORQ tmp0, 5 # 39
+ xor a_64, tmp0
+ add T1, d_64
+ RORQ tmp0, 6 # 34
+ xor a_64, tmp0
+ lea (T1, T2), h_64
+ RORQ tmp0, 28 # 28
+ add tmp0, h_64
+ RotateState
+.endm
+
+########################################################################
+# void sha512_transform_avx(struct sha512_block_state *state,
+# const u8 *data, size_t nblocks);
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of SHA512 message blocks.
+# "nblocks" is the message length in SHA512 blocks
+########################################################################
+SYM_FUNC_START(sha512_transform_avx)
+
+ test msglen, msglen
+ je .Lnowork
+
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # Allocate Stack Space
+ push %rbp
+ mov %rsp, %rbp
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+
+.Lupdateblock:
+
+ # Load state variables
+ mov DIGEST(0), a_64
+ mov DIGEST(1), b_64
+ mov DIGEST(2), c_64
+ mov DIGEST(3), d_64
+ mov DIGEST(4), e_64
+ mov DIGEST(5), f_64
+ mov DIGEST(6), g_64
+ mov DIGEST(7), h_64
+
+ t = 0
+ .rept 80/2 + 1
+ # (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ # +1 iteration because the scheduler leads hashing by 1 iteration
+ .if t < 2
+ # BSWAP 2 QWORDS
+ vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1
+ vmovdqu MSG(t), %xmm0
+ vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
+ vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
+ vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+ vmovdqa %xmm0, WK_2(t) # Store into WK for rounds
+ .elseif t < 16
+ # BSWAP 2 QWORDS# Compute 2 Rounds
+ vmovdqu MSG(t), %xmm0
+ vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
+ SHA512_Round t-2 # Round t-2
+ vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
+ vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+ SHA512_Round t-1 # Round t-1
+ vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK
+ .elseif t < 79
+ # Schedule 2 QWORDS# Compute 2 Rounds
+ SHA512_2Sched_2Round_avx t
+ .else
+ # Compute 2 Rounds
+ SHA512_Round t-2
+ SHA512_Round t-1
+ .endif
+ t = t+2
+ .endr
+
+ # Update digest
+ add a_64, DIGEST(0)
+ add b_64, DIGEST(1)
+ add c_64, DIGEST(2)
+ add d_64, DIGEST(3)
+ add e_64, DIGEST(4)
+ add f_64, DIGEST(5)
+ add g_64, DIGEST(6)
+ add h_64, DIGEST(7)
+
+ # Advance to next message block
+ add $16*8, msg
+ dec msglen
+ jnz .Lupdateblock
+
+ # Restore Stack Pointer
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+.Lnowork:
+ RET
+SYM_FUNC_END(sha512_transform_avx)
+
+########################################################################
+### Binary Data
+
+.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
+.align 16
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
--- /dev/null
+########################################################################
+# Implement fast SHA-512 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 blocks at a time, with 4 lanes per block
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+Y_0 = %ymm4
+Y_1 = %ymm5
+Y_2 = %ymm6
+Y_3 = %ymm7
+
+YTMP0 = %ymm0
+YTMP1 = %ymm1
+YTMP2 = %ymm2
+YTMP3 = %ymm3
+YTMP4 = %ymm8
+XFER = YTMP0
+
+BYTE_FLIP_MASK = %ymm9
+
+# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
+CTX1 = %rdi
+CTX2 = %r12
+# 2nd arg
+INP = %rsi
+# 3rd arg
+NUM_BLKS = %rdx
+
+c = %rcx
+d = %r8
+e = %rdx
+y3 = %rsi
+
+TBL = %rdi # clobbers CTX1
+
+a = %rax
+b = %rbx
+
+f = %r9
+g = %r10
+h = %r11
+old_h = %r11
+
+T1 = %r12 # clobbers CTX2
+y0 = %r13
+y1 = %r14
+y2 = %r15
+
+# Local variables (stack frame)
+XFER_SIZE = 4*8
+SRND_SIZE = 1*8
+INP_SIZE = 1*8
+INPEND_SIZE = 1*8
+CTX_SIZE = 1*8
+
+frame_XFER = 0
+frame_SRND = frame_XFER + XFER_SIZE
+frame_INP = frame_SRND + SRND_SIZE
+frame_INPEND = frame_INP + INP_SIZE
+frame_CTX = frame_INPEND + INPEND_SIZE
+frame_size = frame_CTX + CTX_SIZE
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+
+# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
+# Load ymm with mem and byte swap each dword
+.macro COPY_YMM_AND_BSWAP p1 p2 p3
+ VMOVDQ \p2, \p1
+ vpshufb \p3, \p1, \p1
+.endm
+# rotate_Ys
+# Rotate values of symbols Y0...Y3
+.macro rotate_Ys
+ Y_ = Y_0
+ Y_0 = Y_1
+ Y_1 = Y_2
+ Y_2 = Y_3
+ Y_3 = Y_
+.endm
+
+# RotateState
+.macro RotateState
+ # Rotate symbols a..h right
+ old_h = h
+ TMP_ = h
+ h = g
+ g = f
+ f = e
+ e = d
+ d = c
+ c = b
+ b = a
+ a = TMP_
+.endm
+
+# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
+# YDST = {YSRC1, YSRC2} >> RVAL*8
+.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
+ vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI}
+ vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+################################### RND N + 0 #########################################
+
+ # Extract w[t-7]
+ MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
+ # Calculate w[t-16] + w[t-7]
+ vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
+ # Extract w[t-15]
+ MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
+
+ # Calculate sigma0
+
+ # Calculate w[t-15] ror 1
+ vpsrlq $1, YTMP1, YTMP2
+ vpsllq $(64-1), YTMP1, YTMP3
+ vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
+ # Calculate w[t-15] shr 7
+ vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add frame_XFER(%rsp),h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+
+ and e, y2 # y2 = (f^g)&e # CH
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+
+ add y0, y2 # y2 = S1 + CH # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+################################### RND N + 1 #########################################
+
+ # Calculate w[t-15] ror 8
+ vpsrlq $8, YTMP1, YTMP2
+ vpsllq $(64-8), YTMP1, YTMP1
+ vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
+ # XOR the three components
+ vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
+ vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0
+
+
+ # Add three components, w[t-16], w[t-7] and sigma0
+ vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
+ # Move to appropriate lanes for calculating w[16] and w[17]
+ vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
+ # Move to appropriate lanes for calculating w[18] and w[19]
+ vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
+
+ # Calculate w[16] and w[17] in both 128 bit lanes
+
+ # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
+ vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
+ vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
+
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+
+################################### RND N + 2 #########################################
+
+ vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
+ vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
+ vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
+ vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
+ # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
+
+ # Add sigma1 to the other compunents to get w[16] and w[17]
+ vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
+
+ # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
+ vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
+
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ xor g, y2 # y2 = f^g # CH
+
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+################################### RND N + 3 #########################################
+
+ vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
+ vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
+ vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
+ vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
+ # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
+
+ # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
+ # to newly calculated sigma1 to get w[18] and w[19]
+ vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
+
+ # Form w[19, w[18], w17], w[16]
+ vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ add y0, y2 # y2 = S1 + CH # --
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+
+ add y1, h # h = k + w + h + S0 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+ rotate_Ys
+.endm
+
+.macro DO_4ROUNDS
+
+################################### RND N + 0 #########################################
+
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 1 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 2 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 3 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+.endm
+
+########################################################################
+# void sha512_transform_rorx(struct sha512_block_state *state,
+# const u8 *data, size_t nblocks);
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of SHA512 message blocks.
+# "nblocks" is the message length in SHA512 blocks
+########################################################################
+SYM_FUNC_START(sha512_transform_rorx)
+
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # Allocate Stack Space
+ push %rbp
+ mov %rsp, %rbp
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+
+ shl $7, NUM_BLKS # convert to bytes
+ jz .Ldone_hash
+ add INP, NUM_BLKS # pointer to end of data
+ mov NUM_BLKS, frame_INPEND(%rsp)
+
+ ## load initial digest
+ mov 8*0(CTX1), a
+ mov 8*1(CTX1), b
+ mov 8*2(CTX1), c
+ mov 8*3(CTX1), d
+ mov 8*4(CTX1), e
+ mov 8*5(CTX1), f
+ mov 8*6(CTX1), g
+ mov 8*7(CTX1), h
+
+ # save %rdi (CTX) before it gets clobbered
+ mov %rdi, frame_CTX(%rsp)
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+
+.Lloop0:
+ lea K512(%rip), TBL
+
+ ## byte swap first 16 dwords
+ COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK
+
+ mov INP, frame_INP(%rsp)
+
+ ## schedule 64 input dwords, by doing 12 rounds of 4 each
+ movq $4, frame_SRND(%rsp)
+
+.align 16
+.Lloop1:
+ vpaddq (TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 1*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 2*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 3*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ add $(4*32), TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ subq $1, frame_SRND(%rsp)
+ jne .Lloop1
+
+ movq $2, frame_SRND(%rsp)
+.Lloop2:
+ vpaddq (TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ DO_4ROUNDS
+ vpaddq 1*32(TBL), Y_1, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ add $(2*32), TBL
+ DO_4ROUNDS
+
+ vmovdqa Y_2, Y_0
+ vmovdqa Y_3, Y_1
+
+ subq $1, frame_SRND(%rsp)
+ jne .Lloop2
+
+ mov frame_CTX(%rsp), CTX2
+ addm 8*0(CTX2), a
+ addm 8*1(CTX2), b
+ addm 8*2(CTX2), c
+ addm 8*3(CTX2), d
+ addm 8*4(CTX2), e
+ addm 8*5(CTX2), f
+ addm 8*6(CTX2), g
+ addm 8*7(CTX2), h
+
+ mov frame_INP(%rsp), INP
+ add $128, INP
+ cmp frame_INPEND(%rsp), INP
+ jne .Lloop0
+
+.Ldone_hash:
+
+ # Restore Stack Pointer
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+ vzeroupper
+ RET
+SYM_FUNC_END(sha512_transform_rorx)
+
+########################################################################
+### Binary Data
+
+
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+ .octa 0x18191a1b1c1d1e1f1011121314151617
+
+.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
+.align 32
+MASK_YMM_LO:
+ .octa 0x00000000000000000000000000000000
+ .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
--- /dev/null
+########################################################################
+# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+digest = %rdi
+# ARG2
+msg = %rsi
+# ARG3
+msglen = %rdx
+T1 = %rcx
+T2 = %r8
+a_64 = %r9
+b_64 = %r10
+c_64 = %r11
+d_64 = %r12
+e_64 = %r13
+f_64 = %r14
+g_64 = %r15
+h_64 = %rbx
+tmp0 = %rax
+
+# Local variables (stack frame)
+
+W_SIZE = 80*8
+WK_SIZE = 2*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_size = frame_WK + WK_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i) 8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i) 8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i) 8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+ # Rotate symbols a..h right
+ TMP = h_64
+ h_64 = g_64
+ g_64 = f_64
+ f_64 = e_64
+ e_64 = d_64
+ d_64 = c_64
+ c_64 = b_64
+ b_64 = a_64
+ a_64 = TMP
+.endm
+
+.macro SHA512_Round rnd
+
+ # Compute Round %%t
+ mov f_64, T1 # T1 = f
+ mov e_64, tmp0 # tmp = e
+ xor g_64, T1 # T1 = f ^ g
+ ror $23, tmp0 # 41 # tmp = e ror 23
+ and e_64, T1 # T1 = (f ^ g) & e
+ xor e_64, tmp0 # tmp = (e ror 23) ^ e
+ xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ idx = \rnd
+ add WK_2(idx), T1 # W[t] + K[t] from message scheduler
+ ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4
+ xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov a_64, T2 # T2 = a
+ add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
+ ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov a_64, tmp0 # tmp = a
+ xor c_64, T2 # T2 = a ^ c
+ and c_64, tmp0 # tmp = a & c
+ and b_64, T2 # T2 = (a ^ c) & b
+ xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov a_64, tmp0 # tmp = a
+ ror $5, tmp0 # 39 # tmp = a ror 5
+ xor a_64, tmp0 # tmp = (a ror 5) ^ a
+ add T1, d_64 # e(next_state) = d + T1
+ ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6
+ xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
+ ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+ add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
+ RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_sse rnd
+
+ # Compute rounds t-2 and t-1
+ # Compute message schedule QWORDS t and t+1
+
+ # Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ # scheduler.
+ # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+ # They are then added to their respective SHA512 constants at
+ # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+ # For brievity, the comments following vectored instructions only refer to
+ # the first of a pair of QWORDS.
+ # Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+ # The computation of the message schedule and the rounds are tightly
+ # stitched to take advantage of instruction-level parallelism.
+ # For clarity, integer instructions (for the rounds calculation) are indented
+ # by one tab. Vectored instructions (for the message scheduler) are indented
+ # by two tabs.
+
+ mov f_64, T1
+ idx = \rnd -2
+ movdqa W_t(idx), %xmm2 # XMM2 = W[t-2]
+ xor g_64, T1
+ and e_64, T1
+ movdqa %xmm2, %xmm0 # XMM0 = W[t-2]
+ xor g_64, T1
+ idx = \rnd
+ add WK_2(idx), T1
+ idx = \rnd - 15
+ movdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
+ mov e_64, tmp0
+ ror $23, tmp0 # 41
+ movdqa %xmm5, %xmm3 # XMM3 = W[t-15]
+ xor e_64, tmp0
+ ror $4, tmp0 # 18
+ psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42
+ xor e_64, tmp0
+ ror $14, tmp0 # 14
+ psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1
+ add tmp0, T1
+ add h_64, T1
+ pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2]
+ mov a_64, T2
+ xor c_64, T2
+ pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15]
+ and b_64, T2
+ mov a_64, tmp0
+ psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+ and c_64, tmp0
+ xor tmp0, T2
+ psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+ mov a_64, tmp0
+ ror $5, tmp0 # 39
+ pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+ xor a_64, tmp0
+ ror $6, tmp0 # 34
+ pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+ xor a_64, tmp0
+ ror $28, tmp0 # 28
+ psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+ add tmp0, T2
+ add T1, d_64
+ psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+ lea (T1, T2), h_64
+ RotateState
+ movdqa %xmm2, %xmm1 # XMM1 = W[t-2]
+ mov f_64, T1
+ xor g_64, T1
+ movdqa %xmm5, %xmm4 # XMM4 = W[t-15]
+ and e_64, T1
+ xor g_64, T1
+ psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42
+ idx = \rnd + 1
+ add WK_2(idx), T1
+ mov e_64, tmp0
+ psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7
+ ror $23, tmp0 # 41
+ xor e_64, tmp0
+ pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2]
+ ror $4, tmp0 # 18
+ xor e_64, tmp0
+ pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15]
+ ror $14, tmp0 # 14
+ add tmp0, T1
+ psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+ add h_64, T1
+ mov a_64, T2
+ psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+ xor c_64, T2
+ and b_64, T2
+ pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2])
+ mov a_64, tmp0
+ and c_64, tmp0
+ idx = \rnd - 7
+ movdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
+ xor tmp0, T2
+ pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15])
+ mov a_64, tmp0
+ paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15])
+ ror $5, tmp0 # 39
+ idx =\rnd-16
+ paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+ xor a_64, tmp0
+ paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+ ror $6, tmp0 # 34
+ movdqa %xmm0, W_t(\rnd) # Store scheduled qwords
+ xor a_64, tmp0
+ paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t]
+ ror $28, tmp0 # 28
+ idx = \rnd
+ movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
+ add tmp0, T2
+ add T1, d_64
+ lea (T1, T2), h_64
+ RotateState
+.endm
+
+########################################################################
+# void sha512_transform_ssse3(struct sha512_block_state *state,
+# const u8 *data, size_t nblocks);
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of SHA512 message blocks.
+# "nblocks" is the message length in SHA512 blocks
+########################################################################
+SYM_FUNC_START(sha512_transform_ssse3)
+
+ test msglen, msglen
+ je .Lnowork
+
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # Allocate Stack Space
+ push %rbp
+ mov %rsp, %rbp
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+
+.Lupdateblock:
+
+# Load state variables
+ mov DIGEST(0), a_64
+ mov DIGEST(1), b_64
+ mov DIGEST(2), c_64
+ mov DIGEST(3), d_64
+ mov DIGEST(4), e_64
+ mov DIGEST(5), f_64
+ mov DIGEST(6), g_64
+ mov DIGEST(7), h_64
+
+ t = 0
+ .rept 80/2 + 1
+ # (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ # +1 iteration because the scheduler leads hashing by 1 iteration
+ .if t < 2
+ # BSWAP 2 QWORDS
+ movdqa XMM_QWORD_BSWAP(%rip), %xmm1
+ movdqu MSG(t), %xmm0
+ pshufb %xmm1, %xmm0 # BSWAP
+ movdqa %xmm0, W_t(t) # Store Scheduled Pair
+ paddq K_t(t), %xmm0 # Compute W[t]+K[t]
+ movdqa %xmm0, WK_2(t) # Store into WK for rounds
+ .elseif t < 16
+ # BSWAP 2 QWORDS# Compute 2 Rounds
+ movdqu MSG(t), %xmm0
+ pshufb %xmm1, %xmm0 # BSWAP
+ SHA512_Round t-2 # Round t-2
+ movdqa %xmm0, W_t(t) # Store Scheduled Pair
+ paddq K_t(t), %xmm0 # Compute W[t]+K[t]
+ SHA512_Round t-1 # Round t-1
+ movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK
+ .elseif t < 79
+ # Schedule 2 QWORDS# Compute 2 Rounds
+ SHA512_2Sched_2Round_sse t
+ .else
+ # Compute 2 Rounds
+ SHA512_Round t-2
+ SHA512_Round t-1
+ .endif
+ t = t+2
+ .endr
+
+ # Update digest
+ add a_64, DIGEST(0)
+ add b_64, DIGEST(1)
+ add c_64, DIGEST(2)
+ add d_64, DIGEST(3)
+ add e_64, DIGEST(4)
+ add f_64, DIGEST(5)
+ add g_64, DIGEST(6)
+ add h_64, DIGEST(7)
+
+ # Advance to next message block
+ add $16*8, msg
+ dec msglen
+ jnz .Lupdateblock
+
+ # Restore Stack Pointer
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+.Lnowork:
+ RET
+SYM_FUNC_END(sha512_transform_ssse3)
+
+########################################################################
+### Binary Data
+
+.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
+.align 16
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * x86-optimized SHA-512 block function
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <asm/fpu/api.h>
+#include <crypto/internal/simd.h>
+#include <linux/static_call.h>
+
+DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic);
+
+#define DEFINE_X86_SHA512_FN(c_fn, asm_fn) \
+ asmlinkage void asm_fn(struct sha512_block_state *state, \
+ const u8 *data, size_t nblocks); \
+ static void c_fn(struct sha512_block_state *state, const u8 *data, \
+ size_t nblocks) \
+ { \
+ if (likely(crypto_simd_usable())) { \
+ kernel_fpu_begin(); \
+ asm_fn(state, data, nblocks); \
+ kernel_fpu_end(); \
+ } else { \
+ sha512_blocks_generic(state, data, nblocks); \
+ } \
+ }
+
+DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3);
+DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx);
+DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx);
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ static_call(sha512_blocks_x86)(state, data, nblocks);
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static inline void sha512_mod_init_arch(void)
+{
+ if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) &&
+ boot_cpu_has(X86_FEATURE_AVX)) {
+ if (boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ static_call_update(sha512_blocks_x86,
+ sha512_blocks_avx2);
+ else
+ static_call_update(sha512_blocks_x86,
+ sha512_blocks_avx);
+ } else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
+ static_call_update(sha512_blocks_x86, sha512_blocks_ssse3);
+ }
+}