########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
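# For example, "addm (4*0)(CTX), a" (with a = %eax) expands to
#	add	(4*0)(CTX), %eax
#	mov	%eax, (4*0)(CTX)
# so the digest word in memory (and the register) end up holding mem + reg.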

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi	# 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e	= %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE
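# Resulting frame layout (byte offsets from the 32-byte aligned %rsp):
#	  0..511 : _XFER      W+K scratch (2 blocks x 64 rounds x 4 bytes)
#	512      : _INP_END   pointer to the last input block
#	520      : _INP       current input pointer
#	528      : _CTX       saved digest pointer
#	536      : _RSP       caller's %rsp, restored on exit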

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
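# Both macros rotate by renaming assembler symbols, not by moving data:
# after one ROTATE_ARGS the symbol "a" names the register that previously
# held "h", "b" names the old "a", and so on, while "old_h" keeps naming the
# old "h" so the next round can finish accumulating into it.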

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e				# CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d			# --

	and	b, y3		# y3 = (a|c)&b				# MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB

	add	y0, y2		# y2 = S1 + CH				# --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b				# MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c				# MAJA
	mov	f, y2		# y2 = f				# CH
	xor	g, y2		# y2 = f^g				# CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e				# CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ

	add	y1, h		# h = k + w + h + S0			# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS
	rotate_Xs
.endm
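# FOUR_ROUNDS_AND_SCHED (above) interleaves four SHA-256 rounds with the
# message schedule for the next four words,
#	W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
# where s0(x) = ror(x,7) ^ ror(x,18) ^ (x>>3) and
#       s1(x) = ror(x,17) ^ ror(x,19) ^ (x>>10).
# Each ymm operation processes block 1 in its low 128-bit lane and block 2 in
# its high lane at the same time.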

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS

.endm
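# DO_4ROUNDS (above) performs four rounds from W+K values already stored in
# the _XFER area, with no message scheduling.  It is used for the last 16
# rounds of the first block and for replaying the second block.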

########################################################################
## void sha256_transform_rorx(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
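#
# Rough flow: the prologue saves the callee-saved registers and aligns the
# stack to 32 bytes; loop0 loads and byte-swaps two 64-byte blocks; loop1 and
# loop2 run the 64 rounds of block 1 while stacking the W+K values for both
# blocks (loop1 also computes the message schedule); loop3 replays all 64
# rounds for block 2 from the stacked values; do_last_block/only_one_block
# handle a trailing single block.
#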
.text
ENTRY(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3
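	## X0..X3 now hold W[0..15] for both blocks: each register carries
	## four dwords of block 1 in its low 128-bit lane and the matching
	## four dwords of block 2 in its high lane.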

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1
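	## three passes of loop1 (SRND = 0, 4*32, 8*32) cover rounds 0-47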

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
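	#### Each 32-byte _XFER slot holds block 1's W+K in its low 16 bytes
	#### and block 2's in its high 16 bytes, so the "+ 16" displacement
	#### below selects the block-2 values.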
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	 _XFER + 0*32 + 16
	DO_4ROUNDS	 _XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

do_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
ENDPROC(sha256_transform_rorx)

.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
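# Each 128-bit row of round constants is listed twice so that vpaddd feeds
# the same K values to the block-1 (low) and block-2 (high) lanes.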

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
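# PSHUFFLE_BYTE_FLIP_MASK byte-swaps each 32-bit word so the big-endian
# message bytes are loaded as native dwords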

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF

#endif