Commit | Line | Data |
---|---|---|
d34a4600 TC |
1 | ######################################################################## |
2 | # Implement fast SHA-256 with AVX2 instructions. (x86_64) | |
3 | # | |
4 | # Copyright (C) 2013 Intel Corporation. | |
5 | # | |
6 | # Authors: | |
7 | # James Guilford <james.guilford@intel.com> | |
8 | # Kirk Yap <kirk.s.yap@intel.com> | |
9 | # Tim Chen <tim.c.chen@linux.intel.com> | |
10 | # | |
11 | # This software is available to you under a choice of one of two | |
12 | # licenses. You may choose to be licensed under the terms of the GNU | |
13 | # General Public License (GPL) Version 2, available from the file | |
14 | # COPYING in the main directory of this source tree, or the | |
15 | # OpenIB.org BSD license below: | |
16 | # | |
17 | # Redistribution and use in source and binary forms, with or | |
18 | # without modification, are permitted provided that the following | |
19 | # conditions are met: | |
20 | # | |
21 | # - Redistributions of source code must retain the above | |
22 | # copyright notice, this list of conditions and the following | |
23 | # disclaimer. | |
24 | # | |
25 | # - Redistributions in binary form must reproduce the above | |
26 | # copyright notice, this list of conditions and the following | |
27 | # disclaimer in the documentation and/or other materials | |
28 | # provided with the distribution. | |
29 | # | |
30 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
31 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
32 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
33 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
34 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
35 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
36 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
37 | # SOFTWARE. | |
38 | # | |
39 | ######################################################################## | |
40 | # | |
41 | # This code is described in an Intel White-Paper: | |
42 | # "Fast SHA-256 Implementations on Intel Architecture Processors" | |
43 | # | |
44 | # To find it, surf to http://www.intel.com/p/en_US/embedded | |
45 | # and search for that title. | |
46 | # | |
47 | ######################################################################## | |
48 | # This code schedules 2 blocks at a time, with 4 lanes per block | |
49 | ######################################################################## | |
50 | ||
51 | #ifdef CONFIG_AS_AVX2 | |
52 | #include <linux/linkage.h> | |
53 | ||
54 | ## assume buffers not aligned | |
55 | #define VMOVDQ vmovdqu | |
56 | ||
57 | ################################ Define Macros | |
58 | ||
59 | # addm [mem], reg | |
60 | # Add reg to mem using reg-mem add and store | |
# addm mem, reg
# Accumulate: reg += [mem]; [mem] = reg.
# Used to fold the working variables back into the digest state in memory.
.macro addm mem reg
	add	\mem, \reg
	mov	\reg, \mem
.endm
65 | ||
66 | ################################ | |
67 | ||
# Message-schedule words W[t] for two interleaved blocks (4 dwords/lane).
X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above (low lanes; used on the single/last-block path)
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

# Scratch registers for the message-schedule (sigma0/sigma1) computation.
XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9			# K[t]+W[t], spilled to the _XFER stack area
XTMP5 = %ymm11

SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13		# big-endian -> little-endian dword byte swap

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx			# 3rd arg
INP = %rsi			# 2nd arg
CTX = %rdi			# 1st arg
c = %ecx
d = %r8d
e = %edx			# clobbers NUM_BLKS
y3 = %esi			# clobbers INP

SRND = CTX			# SRND is same register as CTX
				# (CTX is saved to _CTX(%rsp) while SRND is live)

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d			# register that held h before the last ROTATE_ARGS;
				# reassigned by ROTATE_ARGS, consumed by DO_4ROUNDS

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


# Stack frame layout (offsets from the 32-byte-aligned %rsp).
_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END + _INP_END_SIZE
_CTX		= _INP + _INP_SIZE
_RSP		= _CTX + _CTX_SIZE
STACK_SIZE	= _RSP + _RSP_SIZE
130 | ||
131 | # rotate_Xs | |
132 | # Rotate values of symbols X0...X3 | |
# rotate_Xs
# Cyclically rebind the schedule symbols: X0 <- X1 <- X2 <- X3 <- old X0.
# Pure assembler-symbol renaming; emits no instructions.
.macro rotate_Xs
X_TMP_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_TMP_
.endm
140 | ||
141 | # ROTATE_ARGS | |
142 | # Rotate values of symbols a...h | |
# ROTATE_ARGS
# Rebind the symbolic working-variable names a..h one position, so each
# round can be written in terms of fixed names.  Emits no instructions.
# old_h keeps the register that held h before this rotation; DO_4ROUNDS
# uses it to finish the previous round's h update one round late.
.macro ROTATE_ARGS
old_h = h
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
155 | ||
# FOUR_ROUNDS_AND_SCHED disp
# Perform 4 SHA-256 rounds while simultaneously computing (scheduling) the
# next 4 message words W[t..t+3] for both interleaved blocks.  Scalar round
# instructions and AVX2 schedule instructions are deliberately interleaved
# for throughput — do not reorder.  \disp is the byte offset of this round
# group's precomputed K+W words within the _XFER stack area (indexed by SRND).
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c                              # MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS
	rotate_Xs
.endm
357 | ||
# DO_4ROUNDS disp
# Perform 4 SHA-256 rounds with no message scheduling.  Used for the last
# 16 rounds of the first block and for all rounds of the second block (whose
# K+W values were already spilled to _XFER during the first block's pass).
# Rounds N+1..N+3 finish the previous round's h update via old_h, one round
# late; the final two adds complete the last round's h.
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS

.endm
521 | ||
########################################################################
## void sha256_transform_rorx(u32 *digest, const void *data, u64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
ENTRY(sha256_transform_rorx)
.align 32
	# save callee-saved GPRs clobbered by the round macros (a..h, T1, y0..y2)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)	# stash original rsp for the epilogue


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP	# exactly one block?
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)	# CTX's register is reused as SRND below

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	## (block 1 dwords in low lanes, block 2 dwords in high lanes)
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 12 each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND	# 48 rounds with scheduling
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND	# 64 rounds total
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	## fold working variables back into the digest
	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
	## (the high-lane K+W values spilled to _XFER at offset +16)
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0		# two or more blocks remain
	ja	done_hash	# all blocks consumed

do_last_block:
	## single trailing block: load 16 dwords into the XMM (low) lanes only
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp	# restore pre-alignment stack pointer

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
	# NOTE(review): no vzeroupper before ret — presumably the kernel caller
	# brackets this with kernel_fpu_begin()/end(); confirm against caller.
ENDPROC(sha256_transform_rorx)
717 | ||
.section .rodata.cst512.K256, "aM", @progbits, 512
.align 64
# SHA-256 round constants K[0..63] (FIPS 180-4).  Each group of four dwords
# is emitted twice so one 256-bit vpaddd applies the same constants to both
# 128-bit lanes (one block per lane).
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
# vpshufb mask: byte-swap each dword (big-endian message -> little-endian)
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA (0xFF bytes zero the discarded lanes)
.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00 (0xFF bytes zero the discarded lanes)
.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
e183914a | 770 | |
d34a4600 | 771 | #endif |