Commit | Line | Data |
---|---|---|
2874c5fd | 1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
54b6a1bd HY |
2 | /* |
3 | * Implement AES algorithm in Intel AES-NI instructions. | |
4 | * | |
5 | * The white paper of AES-NI instructions can be downloaded from: | |
6 | * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf | |
7 | * | |
8 | * Copyright (C) 2008, Intel Corp. | |
9 | * Author: Huang Ying <ying.huang@intel.com> | |
10 | * Vinodh Gopal <vinodh.gopal@intel.com> | |
11 | * Kahraman Akdemir | |
12 | * | |
0bd82f5f TS |
13 | * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD |
14 | * interface for 64-bit kernels. | |
15 | * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) | |
16 | * Aidan O'Mahony (aidan.o.mahony@intel.com) | |
17 | * Adrian Hoban <adrian.hoban@intel.com> | |
18 | * James Guilford (james.guilford@intel.com) | |
19 | * Gabriele Paoloni <gabriele.paoloni@intel.com> | |
20 | * Tadeusz Struk (tadeusz.struk@intel.com) | |
21 | * Wajdi Feghali (wajdi.k.feghali@intel.com) | |
22 | * Copyright (c) 2010, Intel Corporation. | |
23 | * | |
0d258efb MK |
24 | * Ported x86_64 version to x86: |
25 | * Author: Mathias Krause <minipli@googlemail.com> | |
54b6a1bd HY |
26 | */ |
27 | ||
28 | #include <linux/linkage.h> | |
8691ccd7 | 29 | #include <asm/frame.h> |
9697fa39 | 30 | #include <asm/nospec-branch.h> |
54b6a1bd | 31 | |
e31ac32d TM |
32 | /* |
33 | * The following macros are used to move an (un)aligned 16 byte value to/from | |
34 | * an XMM register. This can be done for either FP or integer values, for FP use | |
35 | * movaps (move aligned packed single) or integer use movdqa (move double quad | |
36 | * aligned). It doesn't make a performance difference which instruction is used | |
37 | * since Nehalem (original Core i7) was released. However, the movaps is a byte | |
38 | * shorter, so that is the one we'll use for now. (same for unaligned). | |
39 | */ | |
40 | #define MOVADQ movaps | |
41 | #define MOVUDQ movups | |
42 | ||
559ad0ff | 43 | #ifdef __x86_64__ |
e31ac32d | 44 | |
e183914a | 45 | # constants in mergeable sections, linker can reorder and merge |
e183914a DV |
46 | .section .rodata.cst16.POLY, "aM", @progbits, 16 |
47 | .align 16 | |
0bd82f5f | 48 | POLY: .octa 0xC2000000000000000000000000000001 |
e183914a DV |
49 | .section .rodata.cst16.TWOONE, "aM", @progbits, 16 |
50 | .align 16 | |
0bd82f5f TS |
51 | TWOONE: .octa 0x00000001000000000000000000000001 |
52 | ||
e183914a DV |
53 | .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 |
54 | .align 16 | |
0bd82f5f | 55 | SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F |
e183914a DV |
56 | .section .rodata.cst16.MASK1, "aM", @progbits, 16 |
57 | .align 16 | |
0bd82f5f | 58 | MASK1: .octa 0x0000000000000000ffffffffffffffff |
e183914a DV |
59 | .section .rodata.cst16.MASK2, "aM", @progbits, 16 |
60 | .align 16 | |
0bd82f5f | 61 | MASK2: .octa 0xffffffffffffffff0000000000000000 |
e183914a DV |
62 | .section .rodata.cst16.ONE, "aM", @progbits, 16 |
63 | .align 16 | |
0bd82f5f | 64 | ONE: .octa 0x00000000000000000000000000000001 |
e183914a DV |
65 | .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 |
66 | .align 16 | |
0bd82f5f | 67 | F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 |
e183914a DV |
68 | .section .rodata.cst16.dec, "aM", @progbits, 16 |
69 | .align 16 | |
0bd82f5f | 70 | dec: .octa 0x1 |
e183914a DV |
71 | .section .rodata.cst16.enc, "aM", @progbits, 16 |
72 | .align 16 | |
0bd82f5f TS |
73 | enc: .octa 0x2 |
74 | ||
e183914a DV |
75 | # order of these constants should not change. |
76 | # more specifically, ALL_F should follow SHIFT_MASK, | |
77 | # and zero should follow ALL_F | |
78 | .section .rodata, "a", @progbits | |
79 | .align 16 | |
80 | SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 | |
81 | ALL_F: .octa 0xffffffffffffffffffffffffffffffff | |
82 | .octa 0x00000000000000000000000000000000 | |
83 | ||
54b6a1bd HY |
84 | .text |
85 | ||
0bd82f5f TS |
86 | |
87 | #define STACK_OFFSET 8*3 | |
0bd82f5f | 88 | |
9ee4a5df DW |
89 | #define AadHash 16*0 |
90 | #define AadLen 16*1 | |
91 | #define InLen (16*1)+8 | |
92 | #define PBlockEncKey 16*2 | |
93 | #define OrigIV 16*3 | |
94 | #define CurCount 16*4 | |
95 | #define PBlockLen 16*5 | |
1476db2d DW |
96 | #define HashKey 16*6 // store HashKey <<1 mod poly here |
97 | #define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here | |
98 | #define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here | |
99 | #define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here | |
100 | #define HashKey_k 16*10 // store XOR of High 64 bits and Low 64 | |
101 | // bits of HashKey <<1 mod poly here | |
102 | //(for Karatsuba purposes) | |
103 | #define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64 | |
104 | // bits of HashKey^2 <<1 mod poly here | |
105 | // (for Karatsuba purposes) | |
106 | #define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64 | |
107 | // bits of HashKey^3 <<1 mod poly here | |
108 | // (for Karatsuba purposes) | |
109 | #define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64 | |
110 | // bits of HashKey^4 <<1 mod poly here | |
111 | // (for Karatsuba purposes) | |
9ee4a5df | 112 | |
0bd82f5f TS |
113 | #define arg1 rdi |
114 | #define arg2 rsi | |
115 | #define arg3 rdx | |
116 | #define arg4 rcx | |
117 | #define arg5 r8 | |
118 | #define arg6 r9 | |
1476db2d DW |
119 | #define arg7 STACK_OFFSET+8(%rsp) |
120 | #define arg8 STACK_OFFSET+16(%rsp) | |
121 | #define arg9 STACK_OFFSET+24(%rsp) | |
122 | #define arg10 STACK_OFFSET+32(%rsp) | |
123 | #define arg11 STACK_OFFSET+40(%rsp) | |
e31ac32d | 124 | #define keysize 2*15*16(%arg1) |
559ad0ff | 125 | #endif |
0bd82f5f TS |
126 | |
127 | ||
54b6a1bd HY |
128 | #define STATE1 %xmm0 |
129 | #define STATE2 %xmm4 | |
130 | #define STATE3 %xmm5 | |
131 | #define STATE4 %xmm6 | |
132 | #define STATE STATE1 | |
133 | #define IN1 %xmm1 | |
134 | #define IN2 %xmm7 | |
135 | #define IN3 %xmm8 | |
136 | #define IN4 %xmm9 | |
137 | #define IN IN1 | |
138 | #define KEY %xmm2 | |
139 | #define IV %xmm3 | |
0d258efb | 140 | |
12387a46 HY |
141 | #define BSWAP_MASK %xmm10 |
142 | #define CTR %xmm11 | |
143 | #define INC %xmm12 | |
54b6a1bd | 144 | |
2481104f | 145 | #define GF128MUL_MASK %xmm7 |
c456a9cd | 146 | |
0d258efb MK |
147 | #ifdef __x86_64__ |
148 | #define AREG %rax | |
54b6a1bd HY |
149 | #define KEYP %rdi |
150 | #define OUTP %rsi | |
0d258efb | 151 | #define UKEYP OUTP |
54b6a1bd HY |
152 | #define INP %rdx |
153 | #define LEN %rcx | |
154 | #define IVP %r8 | |
155 | #define KLEN %r9d | |
156 | #define T1 %r10 | |
157 | #define TKEYP T1 | |
158 | #define T2 %r11 | |
12387a46 | 159 | #define TCTR_LOW T2 |
0d258efb MK |
160 | #else |
161 | #define AREG %eax | |
162 | #define KEYP %edi | |
163 | #define OUTP AREG | |
164 | #define UKEYP OUTP | |
165 | #define INP %edx | |
166 | #define LEN %esi | |
167 | #define IVP %ebp | |
168 | #define KLEN %ebx | |
169 | #define T1 %ecx | |
170 | #define TKEYP T1 | |
171 | #endif | |
54b6a1bd | 172 | |
6c2c86b3 DW |
173 | .macro FUNC_SAVE |
174 | push %r12 | |
175 | push %r13 | |
176 | push %r14 | |
6c2c86b3 DW |
177 | # Pushes %r12, %r13 and %r14; FUNC_RESTORE pops them in reverse order. |
178 | # states of %xmm registers %xmm6:%xmm15 are not saved by this macro | |
179 | # code using it treats all %xmm registers as clobbered | |
180 | # | |
6c2c86b3 DW |
181 | .endm |
182 | ||
183 | ||
184 | .macro FUNC_RESTORE | |
6c2c86b3 DW |
185 | pop %r14 |
186 | pop %r13 | |
187 | pop %r12 | |
188 | .endm | |
0bd82f5f | 189 | |
1476db2d DW |
190 | # Precompute hashkeys: HashKey^1..HashKey^4 (each <<1 mod poly) and their Karatsuba XOR halves. |
191 | # Input: SUBKEY, pointer to the hash subkey (read with an unaligned 16-byte load). | |
192 | # Output: HashKeys stored in gcm_context_data (addressed via %arg2). Only needs to be called | |
193 | # once per key. | |
194 | # clobbers r12, and tmp xmm registers. | |
fb8986e6 DW |
195 | .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7 |
196 | mov \SUBKEY, %r12 | |
1476db2d DW |
197 | movdqu (%r12), \TMP3 |
198 | movdqa SHUF_MASK(%rip), \TMP2 | |
d7866e50 | 199 | pshufb \TMP2, \TMP3 |
1476db2d DW |
200 | |
201 | # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) | |
202 | ||
203 | movdqa \TMP3, \TMP2 | |
204 | psllq $1, \TMP3 | |
205 | psrlq $63, \TMP2 | |
206 | movdqa \TMP2, \TMP1 | |
207 | pslldq $8, \TMP2 | |
208 | psrldq $8, \TMP1 | |
209 | por \TMP2, \TMP3 | |
210 | ||
211 | # reduce HashKey<<1 | |
212 | ||
213 | pshufd $0x24, \TMP1, \TMP2 | |
214 | pcmpeqd TWOONE(%rip), \TMP2 | |
215 | pand POLY(%rip), \TMP2 | |
216 | pxor \TMP2, \TMP3 | |
e5b954e8 | 217 | movdqu \TMP3, HashKey(%arg2) |
1476db2d DW |
218 | |
219 | movdqa \TMP3, \TMP5 | |
220 | pshufd $78, \TMP3, \TMP1 | |
221 | pxor \TMP3, \TMP1 | |
e5b954e8 | 222 | movdqu \TMP1, HashKey_k(%arg2) |
1476db2d DW |
223 | |
224 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | |
225 | # TMP5 = HashKey^2<<1 (mod poly) | |
e5b954e8 | 226 | movdqu \TMP5, HashKey_2(%arg2) |
1476db2d DW |
227 | # HashKey_2 = HashKey^2<<1 (mod poly) |
228 | pshufd $78, \TMP5, \TMP1 | |
229 | pxor \TMP5, \TMP1 | |
e5b954e8 | 230 | movdqu \TMP1, HashKey_2_k(%arg2) |
1476db2d DW |
231 | |
232 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | |
233 | # TMP5 = HashKey^3<<1 (mod poly) | |
e5b954e8 | 234 | movdqu \TMP5, HashKey_3(%arg2) |
1476db2d DW |
235 | pshufd $78, \TMP5, \TMP1 |
236 | pxor \TMP5, \TMP1 | |
e5b954e8 | 237 | movdqu \TMP1, HashKey_3_k(%arg2) |
1476db2d DW |
238 | |
239 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | |
240 | # TMP5 = HashKey^4<<1 (mod poly) | |
e5b954e8 | 241 | movdqu \TMP5, HashKey_4(%arg2) |
1476db2d DW |
242 | pshufd $78, \TMP5, \TMP1 |
243 | pxor \TMP5, \TMP1 | |
e5b954e8 | 244 | movdqu \TMP1, HashKey_4_k(%arg2) |
1476db2d | 245 | .endm |
7af964c2 DW |
246 | |
247 | # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding. | |
248 | # Clobbers rax, r10-r13 and xmm0-xmm7, %xmm13, %xmm14 (xmm7 via PRECOMPUTE, xmm14 via CALC_AAD_HASH) | |
fb8986e6 DW |
249 | .macro GCM_INIT Iv SUBKEY AAD AADLEN |
250 | mov \AADLEN, %r11 | |
9660474b | 251 | mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length |
a7bea830 | 252 | xor %r11d, %r11d |
9660474b DW |
253 | mov %r11, InLen(%arg2) # ctx_data.in_length = 0 |
254 | mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0 | |
255 | mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 (8-byte store: low half of the 16-byte slot) | |
fb8986e6 | 256 | mov \Iv, %rax |
9660474b DW |
257 | movdqu (%rax), %xmm0 |
258 | movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv | |
259 | ||
260 | movdqa SHUF_MASK(%rip), %xmm2 | |
d7866e50 | 261 | pshufb %xmm2, %xmm0 |
9660474b DW |
262 | movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv shuffled with SHUF_MASK |
263 | ||
3347c8a0 | 264 | PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7 |
e5b954e8 | 265 | movdqu HashKey(%arg2), %xmm13 |
c594c540 | 266 | |
fb8986e6 DW |
267 | CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \ |
268 | %xmm4, %xmm5, %xmm6 | |
7af964c2 DW |
269 | .endm |
270 | ||
ba45833e DW |
271 | # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context |
272 | # struct has been initialized by GCM_INIT. \operation is either enc or dec. | |
273 | # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK | |
274 | # Clobbers rax, r10-r13, and xmm0-xmm15 | |
275 | .macro GCM_ENC_DEC operation | |
9660474b | 276 | movdqu AadHash(%arg2), %xmm8 |
1476db2d | 277 | movdqu HashKey(%arg2), %xmm13 |
9660474b | 278 | add %arg5, InLen(%arg2) |
ae952c5e | 279 | |
a7bea830 | 280 | xor %r11d, %r11d # initialise the data pointer offset as zero |
ae952c5e DW |
281 | PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation |
282 | ||
283 | sub %r11, %arg5 # sub partial block data used | |
9660474b | 284 | mov %arg5, %r13 # save the number of bytes |
ae952c5e | 285 | |
9660474b DW |
286 | and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) |
287 | mov %r13, %r12 | |
ba45833e DW |
288 | # Encrypt/Decrypt the first (number of full blocks mod 4) blocks |
289 | ||
290 | and $(3<<4), %r12 | |
291 | jz _initial_num_blocks_is_0_\@ | |
292 | cmp $(2<<4), %r12 | |
293 | jb _initial_num_blocks_is_1_\@ | |
294 | je _initial_num_blocks_is_2_\@ | |
295 | _initial_num_blocks_is_3_\@: | |
296 | INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | |
297 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation | |
298 | sub $48, %r13 | |
299 | jmp _initial_blocks_\@ | |
300 | _initial_num_blocks_is_2_\@: | |
301 | INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | |
302 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation | |
303 | sub $32, %r13 | |
304 | jmp _initial_blocks_\@ | |
305 | _initial_num_blocks_is_1_\@: | |
306 | INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | |
307 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation | |
308 | sub $16, %r13 | |
309 | jmp _initial_blocks_\@ | |
310 | _initial_num_blocks_is_0_\@: | |
311 | INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | |
312 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation | |
313 | _initial_blocks_\@: | |
314 | ||
315 | # Main loop - Encrypt/Decrypt remaining blocks, 64 bytes (4 blocks) per iteration | |
316 | ||
032d049e | 317 | test %r13, %r13 |
ba45833e DW |
318 | je _zero_cipher_left_\@ |
319 | sub $64, %r13 | |
320 | je _four_cipher_left_\@ | |
321 | _crypt_by_4_\@: | |
322 | GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ | |
323 | %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ | |
324 | %xmm7, %xmm8, enc | |
325 | add $64, %r11 | |
326 | sub $64, %r13 | |
327 | jne _crypt_by_4_\@ | |
328 | _four_cipher_left_\@: | |
329 | GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ | |
330 | %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 | |
331 | _zero_cipher_left_\@: | |
9660474b DW |
332 | movdqu %xmm8, AadHash(%arg2) |
333 | movdqu %xmm0, CurCount(%arg2) | |
334 | ||
9ee4a5df DW |
335 | mov %arg5, %r13 |
336 | and $15, %r13 # %r13 = arg5 (mod 16) | |
ba45833e DW |
337 | je _multiple_of_16_bytes_\@ |
338 | ||
9660474b DW |
339 | mov %r13, PBlockLen(%arg2) |
340 | ||
ba45833e DW |
341 | # Handle the last <16 Byte block separately |
342 | paddd ONE(%rip), %xmm0 # INCR CNT to get Yn | |
9660474b | 343 | movdqu %xmm0, CurCount(%arg2) |
9ee4a5df | 344 | movdqa SHUF_MASK(%rip), %xmm10 |
d7866e50 | 345 | pshufb %xmm10, %xmm0 |
ba45833e DW |
346 | |
347 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) | |
9660474b | 348 | movdqu %xmm0, PBlockEncKey(%arg2) |
ba45833e | 349 | |
933d6aef DW |
350 | cmp $16, %arg5 |
351 | jge _large_enough_update_\@ | |
352 | ||
9ee4a5df | 353 | lea (%arg4,%r11,1), %r10 |
ba45833e DW |
354 | mov %r13, %r12 |
355 | READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 | |
933d6aef DW |
356 | jmp _data_read_\@ |
357 | ||
358 | _large_enough_update_\@: | |
359 | sub $16, %r11 | |
360 | add %r13, %r11 | |
361 | ||
362 | # receive the last <16 Byte block | |
363 | movdqu (%arg4, %r11, 1), %xmm1 | |
ba45833e | 364 | |
933d6aef DW |
365 | sub %r13, %r11 |
366 | add $16, %r11 | |
367 | ||
368 | lea SHIFT_MASK+16(%rip), %r12 | |
369 | # adjust the shuffle mask pointer to be able to shift 16-r13 bytes | |
370 | # (r13 is the number of bytes in plaintext mod 16) | |
371 | sub %r13, %r12 | |
372 | # get the appropriate shuffle mask | |
373 | movdqu (%r12), %xmm2 | |
374 | # shift right 16-r13 bytes | |
d7866e50 | 375 | pshufb %xmm2, %xmm1 |
933d6aef DW |
376 | |
377 | _data_read_\@: | |
ba45833e DW |
378 | lea ALL_F+16(%rip), %r12 |
379 | sub %r13, %r12 | |
933d6aef | 380 | |
ba45833e DW |
381 | .ifc \operation, dec |
382 | movdqa %xmm1, %xmm2 | |
383 | .endif | |
384 | pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn) | |
385 | movdqu (%r12), %xmm1 | |
386 | # get the appropriate mask to mask out top 16-r13 bytes of xmm0 | |
387 | pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 | |
388 | .ifc \operation, dec | |
389 | pand %xmm1, %xmm2 | |
390 | movdqa SHUF_MASK(%rip), %xmm10 | |
d7866e50 | 391 | pshufb %xmm10 ,%xmm2 |
ba45833e DW |
392 | |
393 | pxor %xmm2, %xmm8 | |
394 | .else | |
395 | movdqa SHUF_MASK(%rip), %xmm10 | |
d7866e50 | 396 | pshufb %xmm10,%xmm0 |
ba45833e DW |
397 | |
398 | pxor %xmm0, %xmm8 | |
399 | .endif | |
400 | ||
9660474b | 401 | movdqu %xmm8, AadHash(%arg2) |
ba45833e DW |
402 | .ifc \operation, enc |
403 | # the last <16 byte block was already XORed into the hash (xmm8) above | |
404 | movdqa SHUF_MASK(%rip), %xmm10 | |
405 | # shuffle xmm0 back to output as ciphertext | |
d7866e50 | 406 | pshufb %xmm10, %xmm0 |
ba45833e DW |
407 | .endif |
408 | ||
409 | # Output %r13 bytes | |
d7866e50 | 410 | movq %xmm0, %rax |
ba45833e DW |
411 | cmp $8, %r13 |
412 | jle _less_than_8_bytes_left_\@ | |
9ee4a5df | 413 | mov %rax, (%arg3 , %r11, 1) |
ba45833e DW |
414 | add $8, %r11 |
415 | psrldq $8, %xmm0 | |
d7866e50 | 416 | movq %xmm0, %rax |
ba45833e DW |
417 | sub $8, %r13 |
418 | _less_than_8_bytes_left_\@: | |
9ee4a5df | 419 | mov %al, (%arg3, %r11, 1) |
ba45833e DW |
420 | add $1, %r11 |
421 | shr $8, %rax | |
422 | sub $1, %r13 | |
423 | jne _less_than_8_bytes_left_\@ | |
424 | _multiple_of_16_bytes_\@: | |
425 | .endm | |
426 | ||
adcadab3 DW |
427 | # GCM_COMPLETE Finishes update of tag of last partial block |
428 | # Output: Authentication Tag, AUTHTAGLEN bytes written to AUTHTAG | |
429 | # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 | |
fb8986e6 | 430 | .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN |
9660474b | 431 | movdqu AadHash(%arg2), %xmm8 |
1476db2d | 432 | movdqu HashKey(%arg2), %xmm13 |
e2e34b08 DW |
433 | |
434 | mov PBlockLen(%arg2), %r12 | |
435 | ||
032d049e | 436 | test %r12, %r12 |
e2e34b08 DW |
437 | je _partial_done\@ |
438 | ||
439 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | |
440 | ||
441 | _partial_done\@: | |
9660474b | 442 | mov AadLen(%arg2), %r12 # %r12 = aadLen (number of bytes) |
adcadab3 DW |
443 | shl $3, %r12 # convert into number of bits |
444 | movd %r12d, %xmm15 # len(A) in %xmm15 | |
9660474b DW |
445 | mov InLen(%arg2), %r12 |
446 | shl $3, %r12 # len(C) in bits (*8) | |
d7866e50 | 447 | movq %r12, %xmm1 |
9660474b | 448 | |
adcadab3 DW |
449 | pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 |
450 | pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) | |
451 | pxor %xmm15, %xmm8 | |
452 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | |
453 | # final GHASH computation | |
454 | movdqa SHUF_MASK(%rip), %xmm10 | |
d7866e50 | 455 | pshufb %xmm10, %xmm8 |
adcadab3 | 456 | |
9660474b | 457 | movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 |
adcadab3 DW |
458 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) |
459 | pxor %xmm8, %xmm0 | |
460 | _return_T_\@: | |
fb8986e6 DW |
461 | mov \AUTHTAG, %r10 # %r10 = authTag |
462 | mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len | |
adcadab3 DW |
463 | cmp $16, %r11 |
464 | je _T_16_\@ | |
465 | cmp $8, %r11 | |
466 | jl _T_4_\@ | |
467 | _T_8_\@: | |
d7866e50 | 468 | movq %xmm0, %rax |
adcadab3 DW |
469 | mov %rax, (%r10) |
470 | add $8, %r10 | |
471 | sub $8, %r11 | |
472 | psrldq $8, %xmm0 | |
032d049e | 473 | test %r11, %r11 |
adcadab3 DW |
474 | je _return_T_done_\@ |
475 | _T_4_\@: | |
476 | movd %xmm0, %eax | |
477 | mov %eax, (%r10) | |
478 | add $4, %r10 | |
479 | sub $4, %r11 | |
480 | psrldq $4, %xmm0 | |
032d049e | 481 | test %r11, %r11 |
adcadab3 DW |
482 | je _return_T_done_\@ |
483 | _T_123_\@: | |
484 | movd %xmm0, %eax | |
485 | cmp $2, %r11 | |
486 | jl _T_1_\@ | |
487 | mov %ax, (%r10) | |
488 | cmp $2, %r11 | |
489 | je _return_T_done_\@ | |
490 | add $2, %r10 | |
491 | sar $16, %eax | |
492 | _T_1_\@: | |
493 | mov %al, (%r10) | |
494 | jmp _return_T_done_\@ | |
495 | _T_16_\@: | |
496 | movdqu %xmm0, (%r10) | |
497 | _return_T_done_\@: | |
498 | .endm | |
499 | ||
559ad0ff | 500 | #ifdef __x86_64__ |
0bd82f5f TS |
501 | /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
502 | * | |
503 | * Clobbers TMP1-TMP5; the reduced product is left in GH. | |
504 | * Input: A and B (128-bits each, bit-reflected) | |
505 | * Output: C = A*B*x mod poly, (i.e. >>1 ) | |
506 | * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input | |
507 | * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. | |
508 | * | |
509 | */ | |
510 | .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 | |
511 | movdqa \GH, \TMP1 | |
512 | pshufd $78, \GH, \TMP2 | |
513 | pshufd $78, \HK, \TMP3 | |
514 | pxor \GH, \TMP2 # TMP2 = a1+a0 | |
515 | pxor \HK, \TMP3 # TMP3 = b1+b0 | |
d7866e50 UB |
516 | pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1 |
517 | pclmulqdq $0x00, \HK, \GH # GH = a0*b0 | |
518 | pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) | |
0bd82f5f TS |
519 | pxor \GH, \TMP2 |
520 | pxor \TMP1, \TMP2 # TMP2 = (a0*b1)+(a1*b0) | |
521 | movdqa \TMP2, \TMP3 | |
522 | pslldq $8, \TMP3 # left shift TMP3 2 DWs | |
523 | psrldq $8, \TMP2 # right shift TMP2 2 DWs | |
524 | pxor \TMP3, \GH | |
525 | pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK | |
526 | ||
527 | # first phase of the reduction | |
528 | ||
529 | movdqa \GH, \TMP2 | |
530 | movdqa \GH, \TMP3 | |
531 | movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 | |
532 | # in order to perform | |
533 | # independent shifts | |
534 | pslld $31, \TMP2 # packed left shift <<31 | |
535 | pslld $30, \TMP3 # packed left shift <<30 | |
536 | pslld $25, \TMP4 # packed left shift <<25 | |
537 | pxor \TMP3, \TMP2 # xor the shifted versions | |
538 | pxor \TMP4, \TMP2 | |
539 | movdqa \TMP2, \TMP5 | |
540 | psrldq $4, \TMP5 # right shift TMP5 1 DW | |
541 | pslldq $12, \TMP2 # left shift TMP2 3 DWs | |
542 | pxor \TMP2, \GH | |
543 | ||
544 | # second phase of the reduction | |
545 | ||
546 | movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 | |
547 | # in order to perform | |
548 | # independent shifts | |
549 | movdqa \GH,\TMP3 | |
550 | movdqa \GH,\TMP4 | |
551 | psrld $1,\TMP2 # packed right shift >>1 | |
552 | psrld $2,\TMP3 # packed right shift >>2 | |
553 | psrld $7,\TMP4 # packed right shift >>7 | |
554 | pxor \TMP3,\TMP2 # xor the shifted versions | |
555 | pxor \TMP4,\TMP2 | |
556 | pxor \TMP5, \TMP2 | |
557 | pxor \TMP2, \GH | |
558 | pxor \TMP1, \GH # result is in GH | |
559 | .endm | |
560 | ||
b20209c9 JS |
561 | # Reads DLEN bytes starting at DPTR into the low DLEN bytes of XMMDst (upper bytes zero), never reading past DPTR+DLEN
562 | # where 0 < DLEN < 16 | |
563 | # Clobbers %rax, DLEN and XMM1 | |
564 | .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst | |
565 | cmp $8, \DLEN | |
566 | jl _read_lt8_\@ | |
567 | mov (\DPTR), %rax | |
d7866e50 | 568 | movq %rax, \XMMDst |
b20209c9 JS |
569 | sub $8, \DLEN |
570 | jz _done_read_partial_block_\@ | |
571 | xor %eax, %eax | |
572 | _read_next_byte_\@: | |
573 | shl $8, %rax | |
574 | mov 7(\DPTR, \DLEN, 1), %al | |
575 | dec \DLEN | |
576 | jnz _read_next_byte_\@ | |
d7866e50 | 577 | movq %rax, \XMM1 |
b20209c9 JS |
578 | pslldq $8, \XMM1 |
579 | por \XMM1, \XMMDst | |
580 | jmp _done_read_partial_block_\@ | |
581 | _read_lt8_\@: | |
582 | xor %eax, %eax | |
583 | _read_next_byte_lt8_\@: | |
584 | shl $8, %rax | |
585 | mov -1(\DPTR, \DLEN, 1), %al | |
586 | dec \DLEN | |
587 | jnz _read_next_byte_lt8_\@ | |
d7866e50 | 588 | movq %rax, \XMMDst |
b20209c9 JS |
589 | _done_read_partial_block_\@: |
590 | .endm | |
591 | ||
c594c540 DW |
592 | # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted and stores it to AadHash(%arg2).
593 | # clobbers rax (via READ_PARTIAL_BLOCK), r10-11, xmm14 and the TMP1-TMP7 registers | |
fb8986e6 | 594 | .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \ |
c594c540 DW |
595 | TMP6 TMP7 |
596 | MOVADQ SHUF_MASK(%rip), %xmm14 | |
fb8986e6 DW |
597 | mov \AAD, %r10 # %r10 = AAD |
598 | mov \AADLEN, %r11 # %r11 = aadLen | |
c594c540 DW |
599 | pxor \TMP7, \TMP7 |
600 | pxor \TMP6, \TMP6 | |
0487ccac SD |
601 | |
602 | cmp $16, %r11 | |
e1fd316f DW |
603 | jl _get_AAD_rest\@ |
604 | _get_AAD_blocks\@: | |
c594c540 | 605 | movdqu (%r10), \TMP7 |
d7866e50 | 606 | pshufb %xmm14, \TMP7 # byte-reflect the AAD data |
c594c540 DW |
607 | pxor \TMP7, \TMP6 |
608 | GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 | |
0487ccac | 609 | add $16, %r10 |
0487ccac SD |
610 | sub $16, %r11 |
611 | cmp $16, %r11 | |
e1fd316f | 612 | jge _get_AAD_blocks\@ |
0487ccac | 613 | |
c594c540 | 614 | movdqu \TMP6, \TMP7 |
1ecdd37e JS |
615 | |
616 | /* read the last <16B of AAD, if any */ | |
e1fd316f | 617 | _get_AAD_rest\@: |
032d049e | 618 | test %r11, %r11 |
e1fd316f | 619 | je _get_AAD_done\@ |
0487ccac | 620 | |
c594c540 | 621 | READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 |
d7866e50 | 622 | pshufb %xmm14, \TMP7 # byte-reflect the AAD data |
c594c540 DW |
623 | pxor \TMP6, \TMP7 |
624 | GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 | |
625 | movdqu \TMP7, \TMP6 | |
3c097b80 | 626 | |
e1fd316f | 627 | _get_AAD_done\@: |
c594c540 DW |
628 | movdqu \TMP6, AadHash(%arg2) |
629 | .endm | |
630 | ||
ae952c5e DW |
631 | # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
632 | # between update calls. | |
633 | # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK | |
634 | # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context | |
635 | # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 | |
636 | .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ | |
637 | AAD_HASH operation | |
638 | mov PBlockLen(%arg2), %r13 | |
032d049e | 639 | test %r13, %r13 |
ae952c5e DW |
640 | je _partial_block_done_\@ # Leave Macro if no partial blocks |
641 | # Read in input data without over reading | |
642 | cmp $16, \PLAIN_CYPH_LEN | |
643 | jl _fewer_than_16_bytes_\@ | |
644 | movups (\PLAIN_CYPH_IN), %xmm1 # If at least 16 bytes, just fill xmm | |
645 | jmp _data_read_\@ | |
646 | ||
647 | _fewer_than_16_bytes_\@: | |
648 | lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 | |
649 | mov \PLAIN_CYPH_LEN, %r12 | |
650 | READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 | |
651 | ||
652 | mov PBlockLen(%arg2), %r13 | |
653 | ||
654 | _data_read_\@: # Finished reading in data | |
655 | ||
656 | movdqu PBlockEncKey(%arg2), %xmm9 | |
657 | movdqu HashKey(%arg2), %xmm13 | |
658 | ||
659 | lea SHIFT_MASK(%rip), %r12 | |
660 | ||
661 | # adjust the shuffle mask pointer to be able to shift r13 bytes | |
662 | # (r13 is the partial block length loaded from PBlockLen) | |
663 | add %r13, %r12 | |
664 | movdqu (%r12), %xmm2 # get the appropriate shuffle mask | |
d7866e50 | 665 | pshufb %xmm2, %xmm9 # shift right r13 bytes |
ae952c5e DW |
666 | |
667 | .ifc \operation, dec | |
668 | movdqa %xmm1, %xmm3 | |
669 | pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) | |
670 | ||
671 | mov \PLAIN_CYPH_LEN, %r10 | |
672 | add %r13, %r10 | |
673 | # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling | |
674 | sub $16, %r10 | |
675 | # Determine if partial block is not being filled and | |
676 | # shift mask accordingly | |
677 | jge _no_extra_mask_1_\@ | |
678 | sub %r10, %r12 | |
679 | _no_extra_mask_1_\@: | |
680 | ||
681 | movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 | |
682 | # get the appropriate mask to mask out bottom r13 bytes of xmm9 | |
683 | pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9 | |
684 | ||
685 | pand %xmm1, %xmm3 | |
686 | movdqa SHUF_MASK(%rip), %xmm10 | |
d7866e50 UB |
687 | pshufb %xmm10, %xmm3 |
688 | pshufb %xmm2, %xmm3 | |
ae952c5e DW |
689 | pxor %xmm3, \AAD_HASH |
690 | ||
032d049e | 691 | test %r10, %r10 |
ae952c5e DW |
692 | jl _partial_incomplete_1_\@ |
693 | ||
694 | # GHASH computation for the last <16 Byte block | |
695 | GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 | |
a7bea830 | 696 | xor %eax, %eax |
ae952c5e DW |
697 | |
698 | mov %rax, PBlockLen(%arg2) | |
699 | jmp _dec_done_\@ | |
700 | _partial_incomplete_1_\@: | |
701 | add \PLAIN_CYPH_LEN, PBlockLen(%arg2) | |
702 | _dec_done_\@: | |
703 | movdqu \AAD_HASH, AadHash(%arg2) | |
704 | .else | |
705 | pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) | |
706 | ||
707 | mov \PLAIN_CYPH_LEN, %r10 | |
708 | add %r13, %r10 | |
709 | # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling | |
710 | sub $16, %r10 | |
711 | # Determine if partial block is not being filled and | |
712 | # shift mask accordingly | |
713 | jge _no_extra_mask_2_\@ | |
714 | sub %r10, %r12 | |
715 | _no_extra_mask_2_\@: | |
716 | ||
717 | movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 | |
718 | # get the appropriate mask to mask out bottom r13 bytes of xmm9 | |
719 | pand %xmm1, %xmm9 | |
720 | ||
721 | movdqa SHUF_MASK(%rip), %xmm1 | |
d7866e50 UB |
722 | pshufb %xmm1, %xmm9 |
723 | pshufb %xmm2, %xmm9 | |
ae952c5e DW |
724 | pxor %xmm9, \AAD_HASH |
725 | ||
032d049e | 726 | test %r10, %r10 |
ae952c5e DW |
727 | jl _partial_incomplete_2_\@ |
728 | ||
729 | # GHASH computation for the last <16 Byte block | |
730 | GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 | |
a7bea830 | 731 | xor %eax, %eax |
ae952c5e DW |
732 | |
733 | mov %rax, PBlockLen(%arg2) | |
734 | jmp _encode_done_\@ | |
735 | _partial_incomplete_2_\@: | |
736 | add \PLAIN_CYPH_LEN, PBlockLen(%arg2) | |
737 | _encode_done_\@: | |
738 | movdqu \AAD_HASH, AadHash(%arg2) | |
739 | ||
740 | movdqa SHUF_MASK(%rip), %xmm10 | |
741 | # shuffle xmm9 back to output as ciphertext | |
d7866e50 UB |
742 | pshufb %xmm10, %xmm9 |
743 | pshufb %xmm2, %xmm9 | |
ae952c5e DW |
744 | .endif |
745 | # output encrypted Bytes | |
032d049e | 746 | test %r10, %r10 |
ae952c5e DW |
747 | jl _partial_fill_\@ |
748 | mov %r13, %r12 | |
749 | mov $16, %r13 | |
750 | # Set r13 to be the number of bytes to write out | |
751 | sub %r12, %r13 | |
752 | jmp _count_set_\@ | |
753 | _partial_fill_\@: | |
754 | mov \PLAIN_CYPH_LEN, %r13 | |
755 | _count_set_\@: | |
756 | movdqa %xmm9, %xmm0 | |
d7866e50 | 757 | movq %xmm0, %rax |
ae952c5e DW |
758 | cmp $8, %r13 |
759 | jle _less_than_8_bytes_left_\@ | |
760 | ||
761 | mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) | |
762 | add $8, \DATA_OFFSET | |
763 | psrldq $8, %xmm0 | |
d7866e50 | 764 | movq %xmm0, %rax |
ae952c5e DW |
765 | sub $8, %r13 |
766 | _less_than_8_bytes_left_\@: | |
767 | movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) | |
768 | add $1, \DATA_OFFSET | |
769 | shr $8, %rax | |
770 | sub $1, %r13 | |
771 | jne _less_than_8_bytes_left_\@ | |
772 | _partial_block_done_\@: | |
773 | .endm # PARTIAL_BLOCK | |
774 | ||
c594c540 DW |
775 | /* |
776 | * if a = number of total plaintext bytes | |
777 | * b = floor(a/16) | |
778 | * num_initial_blocks = b mod 4 | |
779 | * encrypt the initial num_initial_blocks blocks and apply ghash on | |
780 | * the ciphertext | |
781 | * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers | |
782 | * are clobbered | |
1476db2d | 783 | * %arg1, %arg2, %arg3 are used as pointers only, not modified
c594c540 DW |
784 | */ |
785 | ||
786 | ||
787 | .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ | |
788 | XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation | |
9660474b | 789 | MOVADQ SHUF_MASK(%rip), %xmm14 |
c594c540 DW |
790 | |
791 | movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 | |
792 | ||
0487ccac | 793 | # start AES for num_initial_blocks blocks |
3c097b80 | 794 | |
9660474b | 795 | movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 |
3c097b80 TS |
796 | |
797 | .if (\i == 5) || (\i == 6) || (\i == 7) | |
3c097b80 | 798 | |
e31ac32d TM |
799 | MOVADQ ONE(%RIP),\TMP1 |
800 | MOVADQ 0(%arg1),\TMP2 | |
3c097b80 | 801 | .irpc index, \i_seq |
e31ac32d | 802 | paddd \TMP1, \XMM0 # INCR Y0 |
e1fd316f DW |
803 | .ifc \operation, dec |
804 | movdqa \XMM0, %xmm\index | |
805 | .else | |
e31ac32d | 806 | MOVADQ \XMM0, %xmm\index |
e1fd316f | 807 | .endif |
d7866e50 | 808 | pshufb %xmm14, %xmm\index # perform a 16 byte swap |
e31ac32d | 809 | pxor \TMP2, %xmm\index |
3c097b80 | 810 | .endr |
e31ac32d TM |
811 | lea 0x10(%arg1),%r10 |
812 | mov keysize,%eax | |
813 | shr $2,%eax # 128->4, 192->6, 256->8 | |
814 | add $5,%eax # 128->9, 192->11, 256->13 | |
815 | ||
e1fd316f | 816 | aes_loop_initial_\@: |
e31ac32d TM |
817 | MOVADQ (%r10),\TMP1 |
818 | .irpc index, \i_seq | |
d7866e50 | 819 | aesenc \TMP1, %xmm\index |
3c097b80 | 820 | .endr |
e31ac32d TM |
821 | add $16,%r10 |
822 | sub $1,%eax | |
e1fd316f | 823 | jnz aes_loop_initial_\@ |
e31ac32d TM |
824 | |
825 | MOVADQ (%r10), \TMP1 | |
3c097b80 | 826 | .irpc index, \i_seq |
d7866e50 | 827 | aesenclast \TMP1, %xmm\index # Last Round |
3c097b80 TS |
828 | .endr |
829 | .irpc index, \i_seq | |
9ee4a5df | 830 | movdqu (%arg4 , %r11, 1), \TMP1 |
3c097b80 | 831 | pxor \TMP1, %xmm\index |
9ee4a5df | 832 | movdqu %xmm\index, (%arg3 , %r11, 1) |
3c097b80 TS |
833 | # write back plaintext/ciphertext for num_initial_blocks |
834 | add $16, %r11 | |
e1fd316f DW |
835 | |
836 | .ifc \operation, dec | |
837 | movdqa \TMP1, %xmm\index | |
838 | .endif | |
d7866e50 | 839 | pshufb %xmm14, %xmm\index |
3c097b80 TS |
840 | |
841 | # prepare plaintext/ciphertext for GHASH computation | |
842 | .endr | |
843 | .endif | |
0487ccac | 844 | |
3c097b80 TS |
845 | # apply GHASH on num_initial_blocks blocks |
846 | ||
847 | .if \i == 5 | |
848 | pxor %xmm5, %xmm6 | |
849 | GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | |
850 | pxor %xmm6, %xmm7 | |
851 | GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | |
852 | pxor %xmm7, %xmm8 | |
853 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | |
854 | .elseif \i == 6 | |
855 | pxor %xmm6, %xmm7 | |
856 | GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | |
857 | pxor %xmm7, %xmm8 | |
858 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | |
859 | .elseif \i == 7 | |
860 | pxor %xmm7, %xmm8 | |
861 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | |
862 | .endif | |
863 | cmp $64, %r13 | |
e1fd316f | 864 | jl _initial_blocks_done\@ |
3c097b80 TS |
865 | # no need for precomputed values |
866 | /* | |
867 | * | |
868 | * Precomputations for HashKey parallel with encryption of first 4 blocks. | |
869 | * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i | |
870 | */ | |
e31ac32d TM |
871 | MOVADQ ONE(%RIP),\TMP1 |
872 | paddd \TMP1, \XMM0 # INCR Y0 | |
873 | MOVADQ \XMM0, \XMM1 | |
d7866e50 | 874 | pshufb %xmm14, \XMM1 # perform a 16 byte swap |
3c097b80 | 875 | |
e31ac32d TM |
876 | paddd \TMP1, \XMM0 # INCR Y0 |
877 | MOVADQ \XMM0, \XMM2 | |
d7866e50 | 878 | pshufb %xmm14, \XMM2 # perform a 16 byte swap |
3c097b80 | 879 | |
e31ac32d TM |
880 | paddd \TMP1, \XMM0 # INCR Y0 |
881 | MOVADQ \XMM0, \XMM3 | |
d7866e50 | 882 | pshufb %xmm14, \XMM3 # perform a 16 byte swap |
3c097b80 | 883 | |
e31ac32d TM |
884 | paddd \TMP1, \XMM0 # INCR Y0 |
885 | MOVADQ \XMM0, \XMM4 | |
d7866e50 | 886 | pshufb %xmm14, \XMM4 # perform a 16 byte swap |
3c097b80 | 887 | |
e31ac32d TM |
888 | MOVADQ 0(%arg1),\TMP1 |
889 | pxor \TMP1, \XMM1 | |
890 | pxor \TMP1, \XMM2 | |
891 | pxor \TMP1, \XMM3 | |
892 | pxor \TMP1, \XMM4 | |
3c097b80 TS |
893 | .irpc index, 1234 # do 4 rounds |
894 | movaps 0x10*\index(%arg1), \TMP1 | |
d7866e50 UB |
895 | aesenc \TMP1, \XMM1 |
896 | aesenc \TMP1, \XMM2 | |
897 | aesenc \TMP1, \XMM3 | |
898 | aesenc \TMP1, \XMM4 | |
3c097b80 | 899 | .endr |
3c097b80 TS |
900 | .irpc index, 56789 # do next 5 rounds |
901 | movaps 0x10*\index(%arg1), \TMP1 | |
d7866e50 UB |
902 | aesenc \TMP1, \XMM1 |
903 | aesenc \TMP1, \XMM2 | |
904 | aesenc \TMP1, \XMM3 | |
905 | aesenc \TMP1, \XMM4 | |
3c097b80 | 906 | .endr |
e31ac32d TM |
907 | lea 0xa0(%arg1),%r10 |
908 | mov keysize,%eax | |
909 | shr $2,%eax # 128->4, 192->6, 256->8 | |
910 | sub $4,%eax # 128->0, 192->2, 256->4 | |
e1fd316f | 911 | jz aes_loop_pre_done\@ |
e31ac32d | 912 | |
e1fd316f | 913 | aes_loop_pre_\@: |
e31ac32d TM |
914 | MOVADQ (%r10),\TMP2 |
915 | .irpc index, 1234 | |
d7866e50 | 916 | aesenc \TMP2, %xmm\index |
e31ac32d TM |
917 | .endr |
918 | add $16,%r10 | |
919 | sub $1,%eax | |
e1fd316f | 920 | jnz aes_loop_pre_\@ |
e31ac32d | 921 | |
e1fd316f | 922 | aes_loop_pre_done\@: |
e31ac32d | 923 | MOVADQ (%r10), \TMP2 |
d7866e50 UB |
924 | aesenclast \TMP2, \XMM1 |
925 | aesenclast \TMP2, \XMM2 | |
926 | aesenclast \TMP2, \XMM3 | |
927 | aesenclast \TMP2, \XMM4 | |
9ee4a5df | 928 | movdqu 16*0(%arg4 , %r11 , 1), \TMP1 |
3c097b80 | 929 | pxor \TMP1, \XMM1 |
e1fd316f | 930 | .ifc \operation, dec |
9ee4a5df | 931 | movdqu \XMM1, 16*0(%arg3 , %r11 , 1) |
e1fd316f DW |
932 | movdqa \TMP1, \XMM1 |
933 | .endif | |
9ee4a5df | 934 | movdqu 16*1(%arg4 , %r11 , 1), \TMP1 |
3c097b80 | 935 | pxor \TMP1, \XMM2 |
e1fd316f | 936 | .ifc \operation, dec |
9ee4a5df | 937 | movdqu \XMM2, 16*1(%arg3 , %r11 , 1) |
e1fd316f DW |
938 | movdqa \TMP1, \XMM2 |
939 | .endif | |
9ee4a5df | 940 | movdqu 16*2(%arg4 , %r11 , 1), \TMP1 |
3c097b80 | 941 | pxor \TMP1, \XMM3 |
e1fd316f | 942 | .ifc \operation, dec |
9ee4a5df | 943 | movdqu \XMM3, 16*2(%arg3 , %r11 , 1) |
e1fd316f DW |
944 | movdqa \TMP1, \XMM3 |
945 | .endif | |
9ee4a5df | 946 | movdqu 16*3(%arg4 , %r11 , 1), \TMP1 |
3c097b80 | 947 | pxor \TMP1, \XMM4 |
e1fd316f | 948 | .ifc \operation, dec |
9ee4a5df | 949 | movdqu \XMM4, 16*3(%arg3 , %r11 , 1) |
e1fd316f DW |
950 | movdqa \TMP1, \XMM4 |
951 | .else | |
9ee4a5df DW |
952 | movdqu \XMM1, 16*0(%arg3 , %r11 , 1) |
953 | movdqu \XMM2, 16*1(%arg3 , %r11 , 1) | |
954 | movdqu \XMM3, 16*2(%arg3 , %r11 , 1) | |
955 | movdqu \XMM4, 16*3(%arg3 , %r11 , 1) | |
e1fd316f | 956 | .endif |
3c097b80 | 957 | |
0bd82f5f | 958 | add $64, %r11 |
d7866e50 | 959 | pshufb %xmm14, \XMM1 # perform a 16 byte swap |
0bd82f5f TS |
960 | pxor \XMMDst, \XMM1 |
961 | # combine GHASHed value with the corresponding ciphertext | |
d7866e50 UB |
962 | pshufb %xmm14, \XMM2 # perform a 16 byte swap |
963 | pshufb %xmm14, \XMM3 # perform a 16 byte swap | |
964 | pshufb %xmm14, \XMM4 # perform a 16 byte swap | |
3c097b80 | 965 | |
e1fd316f | 966 | _initial_blocks_done\@: |
3c097b80 | 967 | |
0bd82f5f TS |
968 | .endm |
969 | ||
/*
 * GHASH_4_ENCRYPT_4_PARALLEL_enc
 *
 * encrypt 4 blocks at a time
 * ghash the 4 previously encrypted ciphertext blocks
 *
 * On entry \XMM1..\XMM4 hold the four byte-swapped ciphertext blocks of the
 * previous group (with the running hash already XORed into \XMM1).  They are
 * copied to \XMM5..\XMM8 and multiplied by HashKey_4..HashKey while
 * \XMM1..\XMM4 are reloaded with four fresh counter blocks and run through
 * AES; the two instruction streams are interleaved.
 * On exit \XMM1..\XMM4 hold the new byte-swapped ciphertext, with the
 * reduced GHASH result XORed into \XMM1.
 *
 * \XMM0 is the counter block (incremented four times).
 * \operation is accepted for interface symmetry and not referenced.
 *
 * %arg1 (round keys), %arg2 (hash keys), %arg3 (output) and %arg4 (input)
 * are used as pointers only, not modified.
 * %r11 is the data offset value; it is read but not advanced here.
 * Clobbers: %rax, %r10, flags, %xmm15, \TMP1..\TMP6, \XMM5..\XMM8.
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply XMM5 * HashKey_4 using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqu	  HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	pshufb	  %xmm15, \XMM1			# perform a 16 byte swap
	pclmulqdq $0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	pshufb	  %xmm15, \XMM2			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM3			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM4			# perform a 16 byte swap

	pxor	  (%arg1), \XMM1		# whitening with round key 0
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqu	  HashKey_4_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	aesenc	  \TMP1, \XMM1			# Round 1
	aesenc	  \TMP1, \XMM2
	aesenc	  \TMP1, \XMM3
	aesenc	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	aesenc	  \TMP1, \XMM1			# Round 2
	aesenc	  \TMP1, \XMM2
	aesenc	  \TMP1, \XMM3
	aesenc	  \TMP1, \XMM4

	# multiply XMM6 * HashKey_3 using karatsuba
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 3
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 4
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	movdqu	  HashKey_3_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 5
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5

	# multiply XMM7 * HashKey_2 using karatsuba

	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 6
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 7
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	movdqu	  HashKey_2_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 8
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 9
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10		# r10 -> round key 10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done\@

aes_loop_par_enc\@:
	# extra rounds for 192/256-bit keys; use the macro operands rather
	# than assuming they are %xmm1..%xmm4
	MOVADQ	  (%r10),\TMP3
	aesenc	  \TMP3, \XMM1
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc\@

aes_loop_par_enc_done\@:
	MOVADQ	  (%r10), \TMP3
	aesenclast \TMP3, \XMM1			# last round
	aesenclast \TMP3, \XMM2
	aesenclast \TMP3, \XMM3
	aesenclast \TMP3, \XMM4
	movdqu	  HashKey_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg3,%r11,1)	# Write to the ciphertext buffer
	pshufb	  %xmm15, \XMM1			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM2			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM3			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM4			# perform a 16 byte swap

	# fold the karatsuba middle term into the 256-bit product TMP1:XMM5
	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1			# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
	# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift TMP5 1 DW
	pslldq	  $12, \TMP2			# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2			# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# reduced result is in XMM5

	pxor	  \XMM5, \XMM1			# carry the hash into the next group
.endm
1177 | ||
/*
 * GHASH_4_ENCRYPT_4_PARALLEL_dec
 *
 * decrypt 4 blocks at a time
 * ghash the 4 previously processed ciphertext blocks
 *
 * On entry \XMM1..\XMM4 hold the four byte-swapped ciphertext blocks of the
 * previous group (with the running hash already XORed into \XMM1).  They are
 * copied to \XMM5..\XMM8 and multiplied by HashKey_4..HashKey while
 * \XMM1..\XMM4 are reloaded with four fresh counter blocks and run through
 * AES; the two instruction streams are interleaved.
 * The keystream is XORed with the input and stored as plaintext; the input
 * ciphertext itself is then byte-swapped and left in \XMM1..\XMM4, with the
 * reduced GHASH result XORed into \XMM1.
 *
 * \XMM0 is the counter block (incremented four times).
 * \operation is accepted for interface symmetry and not referenced.
 *
 * %arg1 (round keys), %arg2 (hash keys), %arg3 (output) and %arg4 (input)
 * are used as pointers only, not modified.
 * %r11 is the data offset value; it is read but not advanced here.
 * Clobbers: %rax, %r10, flags, %xmm15, \TMP1..\TMP6, \XMM5..\XMM8.
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply XMM5 * HashKey_4 using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqu	  HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	pshufb	  %xmm15, \XMM1			# perform a 16 byte swap
	pclmulqdq $0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	pshufb	  %xmm15, \XMM2			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM3			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM4			# perform a 16 byte swap

	pxor	  (%arg1), \XMM1		# whitening with round key 0
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqu	  HashKey_4_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	aesenc	  \TMP1, \XMM1			# Round 1
	aesenc	  \TMP1, \XMM2
	aesenc	  \TMP1, \XMM3
	aesenc	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	aesenc	  \TMP1, \XMM1			# Round 2
	aesenc	  \TMP1, \XMM2
	aesenc	  \TMP1, \XMM3
	aesenc	  \TMP1, \XMM4

	# multiply XMM6 * HashKey_3 using karatsuba
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 3
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 4
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	movdqu	  HashKey_3_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 5
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5

	# multiply XMM7 * HashKey_2 using karatsuba

	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 6
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 7
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	movdqu	  HashKey_2_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 8
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 9
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10		# r10 -> round key 10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done\@

aes_loop_par_dec\@:
	# extra rounds for 192/256-bit keys; use the macro operands rather
	# than assuming they are %xmm1..%xmm4
	MOVADQ	  (%r10),\TMP3
	aesenc	  \TMP3, \XMM1
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec\@

aes_loop_par_dec_done\@:
	MOVADQ	  (%r10), \TMP3
	aesenclast \TMP3, \XMM1			# last round
	aesenclast \TMP3, \XMM2
	aesenclast \TMP3, \XMM3
	aesenclast \TMP3, \XMM4
	movdqu	  HashKey_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)		# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1			# keep ciphertext for GHASH
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2			# keep ciphertext for GHASH
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3			# keep ciphertext for GHASH
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4			# keep ciphertext for GHASH
	pshufb	  %xmm15, \XMM1			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM2			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM3			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM4			# perform a 16 byte swap

	# fold the karatsuba middle term into the 256-bit product TMP1:XMM5
	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1			# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
	# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift TMP5 1 DW
	pslldq	  $12, \TMP2			# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2			# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			# reduced result is in XMM5

	pxor	  \XMM5, \XMM1			# carry the hash into the next group
.endm
1389 | ||
/*
 * GHASH_LAST_4 - GHASH the last 4 ciphertext blocks.
 *
 * \XMM1..\XMM4 are multiplied by HashKey_4, HashKey_3, HashKey_2 and HashKey
 * respectively (Karatsuba, three carry-less multiplies each).  The partial
 * products are summed and reduced once; the result is left in \XMMDst.
 *
 * Accumulators while the products are being gathered:
 *   \TMP6   = sum of high products   (a1*b1)
 *   \XMMDst = sum of low products    (a0*b0)
 *   \XMM1   = sum of middle products ((a1+a0)*(b1+b0))
 *
 * %arg2 is read as the base of the hash-key table.
 * Clobbers \TMP1..\TMP7 and \XMM1..\XMM4.  The instruction order below
 * relies on every operand being a distinct register.
 */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# XMM1 * HashKey_4: this first product initialises the accumulators

	movdqu	  HashKey_4(%arg2), \TMP5
	movdqu	  HashKey_4_k(%arg2), \TMP4
	pshufd	  $78, \XMM1, \TMP2
	movdqa	  \XMM1, \TMP6
	pxor	  \XMM1, \TMP2			# TMP2 = a1+a0
	pclmulqdq $0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pclmulqdq $0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1			# result in TMP6, XMMDst, XMM1

	# XMM2 * HashKey_3

	movdqu	  HashKey_3(%arg2), \TMP5
	movdqu	  HashKey_3_k(%arg2), \TMP4
	pshufd	  $78, \XMM2, \TMP2
	movdqa	  \XMM2, \TMP1
	pxor	  \XMM2, \TMP2			# TMP2 = a1+a0
	pclmulqdq $0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
	# results accumulated in TMP6, XMMDst, XMM1

	# XMM3 * HashKey_2

	movdqu	  HashKey_2(%arg2), \TMP5
	movdqu	  HashKey_2_k(%arg2), \TMP4
	pshufd	  $78, \XMM3, \TMP2
	movdqa	  \XMM3, \TMP1
	pxor	  \XMM3, \TMP2			# TMP2 = a1+a0
	pclmulqdq $0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1			# results accumulated in TMP6, XMMDst, XMM1

	# XMM4 * HashKey

	movdqu	  HashKey(%arg2), \TMP5
	movdqu	  HashKey_k(%arg2), \TMP4
	pshufd	  $78, \XMM4, \TMP2
	movdqa	  \XMM4, \TMP1
	pxor	  \XMM4, \TMP2			# TMP2 = a1+a0
	pclmulqdq $0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# TMP2 = middle term: summed middle products minus high and low sums

	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4			# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
	# TMP6:XMMDst now holds the 256-bit sum of the four products

	# reduction, phase one: three independent left shifts of XMMDst
	movdqa	  \XMMDst, \TMP4
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP2
	pslld	  $25, \TMP4			# packed left shift << 25
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $31, \TMP2			# packed left shift << 31
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7			# right shift TMP7 1 DW
	pslldq	  $12, \TMP2			# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# reduction, phase two: three independent right shifts of XMMDst
	movdqa	  \XMMDst, \TMP4
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP2
	psrld	  $7, \TMP4			# packed right shift >> 7
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $1, \TMP2			# packed right shift >> 1
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst		# reduced result is in XMMDst
.endm
1486 | ||
0bd82f5f | 1487 | |
e31ac32d TM |
/*
 * ENCRYPT_SINGLE_BLOCK - AES-encrypt the block held in \XMM0, in place.
 *
 * \XMM0  block to encrypt; replaced by the result
 * \TMP1  scratch xmm register, receives each round key in turn
 *
 * The round keys are read from (%arg1); the number of rounds is derived
 * from keysize.  Uses %eax and %r10, and changes the flags.
 */

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	lea	16(%arg1), %r10		# r10 -> round key 1
	mov	keysize,%eax
	pxor	(%arg1), \XMM0		# whitening with round key 0
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13 full rounds

_esb_loop_\@:
	MOVADQ	(%r10),\TMP1
	add	$16,%r10		# step to the next round key
	aesenc	\TMP1,\XMM0
	sub	$1,%eax
	jnz	_esb_loop_\@

	MOVADQ	(%r10),\TMP1		# final round key
	aesenclast \TMP1,\XMM0
.endm
0bd82f5f TS |
1510 | /***************************************************************************** |
1511 | * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. | |
9ee4a5df DW |
1512 | * struct gcm_context_data *data |
1513 | * // Context data | |
0bd82f5f TS |
1514 | * u8 *out, // Plaintext output. Encrypt in-place is allowed. |
1515 | * const u8 *in, // Ciphertext input | |
1516 | * u64 plaintext_len, // Length of data in bytes for decryption. | |
1517 | * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) | |
1518 | * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) | |
1519 | * // concatenated with 0x00000001. 16-byte aligned pointer. | |
1520 | * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. | |
1521 | * const u8 *aad, // Additional Authentication Data (AAD) | |
1522 | * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes | |
1523 | * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the | |
1524 | * // given authentication tag and only return the plaintext if they match. | |
1525 | * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 | |
1526 | * // (most likely), 12 or 8. | |
1527 | * | |
1528 | * Assumptions: | |
1529 | * | |
1530 | * keys: | |
1531 | * keys are pre-expanded and aligned to 16 bytes. we are using the first | |
1532 | * set of 11 keys in the data structure void *aes_ctx | |
1533 | * | |
1534 | * iv: | |
1535 | * 0 1 2 3 | |
1536 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
1537 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1538 | * | Salt (From the SA) | | |
1539 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1540 | * | Initialization Vector | | |
1541 | * | (This is the sequence number from IPSec header) | | |
1542 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1543 | * | 0x1 | | |
1544 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1545 | * | |
1546 | * | |
1547 | * | |
1548 | * AAD: | |
1549 | * AAD padded to 128 bits with 0 | |
1550 | * for example, assume AAD is a u32 vector | |
1551 | * | |
1552 | * if AAD is 8 bytes: | |
1553 | * AAD[3] = {A0, A1}; | |
1554 | * padded AAD in xmm register = {A1 A0 0 0} | |
1555 | * | |
1556 | * 0 1 2 3 | |
1557 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
1558 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1559 | * | SPI (A1) | | |
1560 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1561 | * | 32-bit Sequence Number (A0) | | |
1562 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1563 | * | 0x0 | | |
1564 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1565 | * | |
1566 | * AAD Format with 32-bit Sequence Number | |
1567 | * | |
1568 | * if AAD is 12 bytes: | |
1569 | * AAD[3] = {A0, A1, A2}; | |
1570 | * padded AAD in xmm register = {A2 A1 A0 0} | |
1571 | * | |
1572 | * 0 1 2 3 | |
1573 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
1574 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1575 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
1576 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1577 | * | SPI (A2) | | |
1578 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1579 | * | 64-bit Extended Sequence Number {A1,A0} | | |
1580 | * | | | |
1581 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1582 | * | 0x0 | | |
1583 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1584 | * | |
1585 | * AAD Format with 64-bit Extended Sequence Number | |
1586 | * | |
0bd82f5f TS |
1587 | * poly = x^128 + x^127 + x^126 + x^121 + 1 |
1588 | * | |
1589 | *****************************************************************************/ | |
6dcc5627 | 1590 | SYM_FUNC_START(aesni_gcm_dec) |
/*
 * One-shot GCM decrypt: the body is nothing but macro expansions (all of them
 * defined elsewhere in this file), run in the order init -> process ->
 * complete and bracketed by FUNC_SAVE / FUNC_RESTORE.
 */
6c2c86b3 | 1591 | FUNC_SAVE |
0bd82f5f | 1592 | |
/*
 * Arguments 6..9 feed GCM_INIT; presumably iv, hash_subkey, aad and aad_len,
 * matching the aesni_gcm_enc prototype -- TODO confirm.
 * NOTE(review): only arg6 carries a '%' prefix; arg7..arg11 are presumably
 * preprocessor names for stack-passed arguments -- confirm at their definition.
 */
fb8986e6 | 1593 | GCM_INIT %arg6, arg7, arg8, arg9 |
/* Bulk data processing in "dec" mode. */
ba45833e | 1594 | GCM_ENC_DEC dec |
/* Arguments 10 and 11 (presumably auth_tag, auth_tag_len) go to the completion step. */
fb8986e6 | 1595 | GCM_COMPLETE arg10, arg11 |
6c2c86b3 | 1596 | FUNC_RESTORE |
f94909ce | 1597 | RET |
6dcc5627 | 1598 | SYM_FUNC_END(aesni_gcm_dec) |
0bd82f5f TS |
1599 | |
1600 | ||
1601 | /***************************************************************************** | |
1602 | * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. | |
9ee4a5df DW |
1603 | * struct gcm_context_data *data, |
1604 | * // Context data | |
0bd82f5f TS |
1605 | * u8 *out, // Ciphertext output. Encrypt in-place is allowed. |
1606 | * const u8 *in, // Plaintext input | |
1607 | * u64 plaintext_len, // Length of data in bytes for encryption. | |
1608 | * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) | |
1609 | * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) | |
1610 | * // concatenated with 0x00000001. 16-byte aligned pointer. | |
1611 | * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. | |
1612 | * const u8 *aad, // Additional Authentication Data (AAD) | |
1613 | * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes | |
1614 | * u8 *auth_tag, // Authenticated Tag output. | |
1615 | * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), | |
1616 | * // 12 or 8. | |
1617 | * | |
1618 | * Assumptions: | |
1619 | * | |
1620 | * keys: | |
1621 | * keys are pre-expanded and aligned to 16 bytes. we are using the | |
1622 | * first set of 11 keys in the data structure void *aes_ctx | |
1623 | * | |
1624 | * | |
1625 | * iv: | |
1626 | * 0 1 2 3 | |
1627 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
1628 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1629 | * | Salt (From the SA) | | |
1630 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1631 | * | Initialization Vector | | |
1632 | * | (This is the sequence number from IPSec header) | | |
1633 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1634 | * | 0x1 | | |
1635 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1636 | * | |
1637 | * | |
1638 | * | |
1639 | * AAD: | |
1640 | * AAD padded to 128 bits with 0 | |
1641 | * for example, assume AAD is a u32 vector | |
1642 | * | |
1643 | * if AAD is 8 bytes: | |
1644 | * AAD[3] = {A0, A1}; | |
1645 | * padded AAD in xmm register = {A1 A0 0 0} | |
1646 | * | |
1647 | * 0 1 2 3 | |
1648 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
1649 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1650 | * | SPI (A1) | | |
1651 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1652 | * | 32-bit Sequence Number (A0) | | |
1653 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1654 | * | 0x0 | | |
1655 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1656 | * | |
1657 | * AAD Format with 32-bit Sequence Number | |
1658 | * | |
1659 | * if AAD is 12 bytes: | |
1660 | * AAD[3] = {A0, A1, A2}; | |
1661 | * padded AAD in xmm register = {A2 A1 A0 0} | |
1662 | * | |
1663 | * 0 1 2 3 | |
1664 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |
1665 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1666 | * | SPI (A2) | | |
1667 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1668 | * | 64-bit Extended Sequence Number {A1,A0} | | |
1669 | * | | | |
1670 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1671 | * | 0x0 | | |
1672 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
1673 | * | |
1674 | * AAD Format with 64-bit Extended Sequence Number | |
1675 | * | |
0bd82f5f TS |
1676 | * poly = x^128 + x^127 + x^126 + x^121 + 1 |
1677 | ***************************************************************************/ | |
6dcc5627 | 1678 | SYM_FUNC_START(aesni_gcm_enc) |
/*
 * One-shot GCM encrypt: the body is nothing but macro expansions (all of them
 * defined elsewhere in this file), run in the order init -> process ->
 * complete and bracketed by FUNC_SAVE / FUNC_RESTORE.
 */
6c2c86b3 | 1679 | FUNC_SAVE |
0bd82f5f | 1680 | |
/*
 * Per the prototype above: arg6 = iv, arg7 = hash_subkey, arg8 = aad,
 * arg9 = aad_len.
 * NOTE(review): only arg6 carries a '%' prefix; arg7..arg11 are presumably
 * preprocessor names for stack-passed arguments -- confirm at their definition.
 */
fb8986e6 | 1681 | GCM_INIT %arg6, arg7, arg8, arg9 |
/* Bulk data processing in "enc" mode. */
ba45833e | 1682 | GCM_ENC_DEC enc |
fb8986e6 DW |
1683 | |
/* Per the prototype above: arg10 = auth_tag, arg11 = auth_tag_len. */
1684 | GCM_COMPLETE arg10, arg11 |
6c2c86b3 | 1685 | FUNC_RESTORE |
f94909ce | 1686 | RET |
6dcc5627 | 1687 | SYM_FUNC_END(aesni_gcm_enc) |
3c097b80 | 1688 | |
fb8986e6 DW |
1689 | /***************************************************************************** |
1690 | * void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. | |
1691 | * struct gcm_context_data *data, | |
1692 | * // context data | |
1693 | * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) | |
1694 | * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) | |
1695 | * // concatenated with 0x00000001. 16-byte aligned pointer. | |
1696 | * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. | |
1697 | * const u8 *aad, // Additional Authentication Data (AAD) | |
1698 | * u64 aad_len) // Length of AAD in bytes. | |
1699 | */ | |
6dcc5627 | 1700 | SYM_FUNC_START(aesni_gcm_init) |
fb8986e6 DW |
/*
 * Init-only entry point: expands GCM_INIT (defined elsewhere in this file)
 * between FUNC_SAVE and FUNC_RESTORE and returns.
 */
1701 | FUNC_SAVE |
/* Per the prototype above: arg3 = iv, arg4 = hash_subkey, arg5 = aad, arg6 = aad_len. */
1702 | GCM_INIT %arg3, %arg4,%arg5, %arg6 |
1703 | FUNC_RESTORE |
f94909ce | 1704 | RET |
6dcc5627 | 1705 | SYM_FUNC_END(aesni_gcm_init) |
fb8986e6 DW |
1706 | |
1707 | /***************************************************************************** | |
1708 | * void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. | |
1709 | * struct gcm_context_data *data, | |
1710 | * // context data | |
1711 | * u8 *out, // Ciphertext output. Encrypt in-place is allowed. | |
1712 | * const u8 *in, // Plaintext input | |
1713 | * u64 plaintext_len); // Length of data in bytes for encryption. |
1714 | */ | |
6dcc5627 | 1715 | SYM_FUNC_START(aesni_gcm_enc_update) |
fb8986e6 DW |
/*
 * Data-processing-only entry point: expands GCM_ENC_DEC in "enc" mode between
 * FUNC_SAVE and FUNC_RESTORE. No GCM_INIT or GCM_COMPLETE is expanded here.
 */
1716 | FUNC_SAVE |
/* The macro takes no explicit operands here; how it finds out/in/len is defined with the macro. */
1717 | GCM_ENC_DEC enc |
1718 | FUNC_RESTORE |
f94909ce | 1719 | RET |
6dcc5627 | 1720 | SYM_FUNC_END(aesni_gcm_enc_update) |
fb8986e6 DW |
1721 | |
1722 | /***************************************************************************** | |
1723 | * void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. | |
1724 | * struct gcm_context_data *data, | |
1725 | * // context data | |
1726 | * u8 *out, // Plaintext output. Decrypt in-place is allowed. |
1727 | * const u8 *in, // Ciphertext input |
1728 | * u64 plaintext_len); // Length of data in bytes for decryption. |
1729 | */ | |
6dcc5627 | 1730 | SYM_FUNC_START(aesni_gcm_dec_update) |
fb8986e6 DW |
/*
 * Data-processing-only entry point: expands GCM_ENC_DEC in "dec" mode between
 * FUNC_SAVE and FUNC_RESTORE. No GCM_INIT or GCM_COMPLETE is expanded here.
 */
1731 | FUNC_SAVE |
/* The macro takes no explicit operands here; how it finds out/in/len is defined with the macro. */
1732 | GCM_ENC_DEC dec |
1733 | FUNC_RESTORE |
f94909ce | 1734 | RET |
6dcc5627 | 1735 | SYM_FUNC_END(aesni_gcm_dec_update) |
fb8986e6 DW |
1736 | |
1737 | /***************************************************************************** | |
1738 | * void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. | |
1739 | * struct gcm_context_data *data, | |
1740 | * // context data | |
1741 | * u8 *auth_tag, // Authenticated Tag output. | |
1742 | * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), | |
1743 | * // 12 or 8. | |
1744 | */ | |
6dcc5627 | 1745 | SYM_FUNC_START(aesni_gcm_finalize) |
fb8986e6 DW |
/*
 * Completion-only entry point: expands GCM_COMPLETE (defined elsewhere in
 * this file) between FUNC_SAVE and FUNC_RESTORE.
 */
1746 | FUNC_SAVE |
/*
 * Per the prototype above: arg3 = auth_tag, arg4 = auth_tag_len.
 * NOTE(review): the two operands are separated by a blank rather than a
 * comma, unlike the other GCM_COMPLETE uses in this file -- presumably
 * accepted by the assembler's macro argument parsing; confirm.
 */
1747 | GCM_COMPLETE %arg3 %arg4 |
1748 | FUNC_RESTORE |
f94909ce | 1749 | RET |
6dcc5627 | 1750 | SYM_FUNC_END(aesni_gcm_finalize) |
fb8986e6 | 1751 | |
559ad0ff | 1752 | #endif |
0bd82f5f | 1753 | |
74d8b90a | 1754 | SYM_FUNC_START_LOCAL(_key_expansion_256a) |
54b6a1bd HY |
/*
 * Key-schedule step shared by the 128-bit path and the "a" half of the
 * 256-bit path (internal ABI, called from aesni_set_key):
 *   in:  %xmm0 = previous round key, %xmm1 = aeskeygenassist result,
 *        dword 0 of %xmm4 = 0, TKEYP = slot to write
 *   out: %xmm0 = new round key (also stored at (TKEYP)), TKEYP += 16
 *   changed: %xmm1, %xmm4 (dword 0 of %xmm4 is still 0 on return)
 */
/* Broadcast dword 3 of the keygen-assist result to all four lanes. */
1755 | pshufd $0b11111111, %xmm1, %xmm1 |
/*
 * The two shufps/pxor pairs turn %xmm0 = {w0,w1,w2,w3} into the running XOR
 * {w0, w0^w1, w0^w1^w2, w0^w1^w2^w3}; the zero dword needed for the shifted
 * copies comes from dword 0 of %xmm4.
 */
1756 | shufps $0b00010000, %xmm0, %xmm4 |
1757 | pxor %xmm4, %xmm0 |
1758 | shufps $0b10001100, %xmm0, %xmm4 |
1759 | pxor %xmm4, %xmm0 |
/* Fold in the broadcast keygen-assist word. */
1760 | pxor %xmm1, %xmm0 |
0d258efb MK |
/* Store the new round key (aligned store) and advance the write cursor. */
1761 | movaps %xmm0, (TKEYP) |
1762 | add $0x10, TKEYP |
f94909ce | 1763 | RET |
74d8b90a | 1764 | SYM_FUNC_END(_key_expansion_256a) |
/* The 128-bit schedule uses exactly the same step, so it is exported as an alias. */
7be2e319 | 1765 | SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a) |
54b6a1bd | 1766 | |
74d8b90a | 1767 | SYM_FUNC_START_LOCAL(_key_expansion_192a) |
54b6a1bd HY |
/*
 * 192-bit key-schedule step, "a" variant (internal ABI, called from
 * aesni_set_key): advances the 6-dword key state held in %xmm0 (4 dwords) and
 * the low half of %xmm2 (2 dwords), and writes 32 bytes at TKEYP.
 *   in:  %xmm0, %xmm2 = key state, %xmm1 = aeskeygenassist result,
 *        dword 0 of %xmm4 = 0
 *   out: %xmm0, %xmm2 = updated key state, TKEYP += 32
 *   changed: %xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
/* Broadcast dword 1 of the keygen-assist result to all four lanes. */
1768 | pshufd $0b01010101, %xmm1, %xmm1 |
/* Running XOR across the four dwords of %xmm0 (zero dword supplied by %xmm4). */
1769 | shufps $0b00010000, %xmm0, %xmm4 |
1770 | pxor %xmm4, %xmm0 |
1771 | shufps $0b10001100, %xmm0, %xmm4 |
1772 | pxor %xmm4, %xmm0 |
1773 | pxor %xmm1, %xmm0 |
1774 | |
/*
 * Update %xmm2: keep two copies of the old value (%xmm5 for the 4-byte
 * shifted term, %xmm6 for the store below), then XOR in the shifted copy and
 * the broadcast of dword 3 of the new %xmm0.
 */
1775 | movaps %xmm2, %xmm5 |
1776 | movaps %xmm2, %xmm6 |
1777 | pslldq $4, %xmm5 |
1778 | pshufd $0b11111111, %xmm0, %xmm3 |
1779 | pxor %xmm3, %xmm2 |
1780 | pxor %xmm5, %xmm2 |
1781 | |
/*
 * First stored slot  = low two dwords of the old %xmm2 followed by the low
 *                      two dwords of the new %xmm0.
 * Second stored slot = high two dwords of the new %xmm0 followed by the low
 *                      two dwords of the new %xmm2.
 */
1782 | movaps %xmm0, %xmm1 |
1783 | shufps $0b01000100, %xmm0, %xmm6 |
0d258efb | 1784 | movaps %xmm6, (TKEYP) |
54b6a1bd | 1785 | shufps $0b01001110, %xmm2, %xmm1 |
0d258efb MK |
1786 | movaps %xmm1, 0x10(TKEYP) |
/* Two 16-byte slots were written. */
1787 | add $0x20, TKEYP |
f94909ce | 1788 | RET |
74d8b90a | 1789 | SYM_FUNC_END(_key_expansion_192a) |
54b6a1bd | 1790 | |
74d8b90a | 1791 | SYM_FUNC_START_LOCAL(_key_expansion_192b) |
54b6a1bd HY |
/*
 * 192-bit key-schedule step, "b" variant (internal ABI, called from
 * aesni_set_key): same state update as _key_expansion_192a, but only the new
 * %xmm0 is written out (16 bytes).
 *   in:  %xmm0, %xmm2 = key state, %xmm1 = aeskeygenassist result,
 *        dword 0 of %xmm4 = 0
 *   out: %xmm0, %xmm2 = updated key state, TKEYP += 16
 *   changed: %xmm1, %xmm3, %xmm4, %xmm5
 */
/* Broadcast dword 1 of the keygen-assist result to all four lanes. */
1792 | pshufd $0b01010101, %xmm1, %xmm1 |
/* Running XOR across the four dwords of %xmm0 (zero dword supplied by %xmm4). */
1793 | shufps $0b00010000, %xmm0, %xmm4 |
1794 | pxor %xmm4, %xmm0 |
1795 | shufps $0b10001100, %xmm0, %xmm4 |
1796 | pxor %xmm4, %xmm0 |
1797 | pxor %xmm1, %xmm0 |
1798 | |
/* %xmm2 ^= (old %xmm2 shifted left by 4 bytes) ^ broadcast(dword 3 of new %xmm0). */
1799 | movaps %xmm2, %xmm5 |
1800 | pslldq $4, %xmm5 |
1801 | pshufd $0b11111111, %xmm0, %xmm3 |
1802 | pxor %xmm3, %xmm2 |
1803 | pxor %xmm5, %xmm2 |
1804 | |
0d258efb MK |
/* Store one 16-byte slot and advance the write cursor. */
1805 | movaps %xmm0, (TKEYP) |
1806 | add $0x10, TKEYP |
f94909ce | 1807 | RET |
74d8b90a | 1808 | SYM_FUNC_END(_key_expansion_192b) |
54b6a1bd | 1809 | |
74d8b90a | 1810 | SYM_FUNC_START_LOCAL(_key_expansion_256b) |
54b6a1bd HY |
/*
 * "b" half of the 256-bit key-schedule step (internal ABI, called from
 * aesni_set_key): same shape as _key_expansion_256a, but it operates on
 * %xmm2 and uses dword 2 of the keygen-assist result.
 *   in:  %xmm2 = round key two slots back, %xmm1 = aeskeygenassist result,
 *        dword 0 of %xmm4 = 0, TKEYP = slot to write
 *   out: %xmm2 = new round key (also stored at (TKEYP)), TKEYP += 16
 *   changed: %xmm1, %xmm4
 */
/* Broadcast dword 2 of the keygen-assist result to all four lanes. */
1811 | pshufd $0b10101010, %xmm1, %xmm1 |
/* Running XOR across the four dwords of %xmm2 (zero dword supplied by %xmm4). */
1812 | shufps $0b00010000, %xmm2, %xmm4 |
1813 | pxor %xmm4, %xmm2 |
1814 | shufps $0b10001100, %xmm2, %xmm4 |
1815 | pxor %xmm4, %xmm2 |
1816 | pxor %xmm1, %xmm2 |
0d258efb MK |
/* Store the new round key (aligned store) and advance the write cursor. */
1817 | movaps %xmm2, (TKEYP) |
1818 | add $0x10, TKEYP |
f94909ce | 1819 | RET |
74d8b90a | 1820 | SYM_FUNC_END(_key_expansion_256b) |
54b6a1bd HY |
1821 | |
1822 | /* | |
1823 | * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, | |
1824 | * unsigned int key_len) | |
1825 | */ | |
6dcc5627 | 1826 | SYM_FUNC_START(aesni_set_key) |
/*
 * Expand the user key into the encryption round keys starting at ctx+0, then
 * derive the decryption round keys starting at ctx+240, and record key_len at
 * ctx+480. KEYP/UKEYP/TKEYP/AREG are register macros defined elsewhere in
 * this file.
 */
8691ccd7 | 1827 | FRAME_BEGIN |
0d258efb MK |
/* 32-bit build only: save KEYP, then fetch the three arguments from the stack. */
1828 | #ifndef __x86_64__ |
1829 | pushl KEYP |
8691ccd7 JP |
1830 | movl (FRAME_OFFSET+8)(%esp), KEYP # ctx |
1831 | movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key |
1832 | movl (FRAME_OFFSET+16)(%esp), %edx # key_len |
0d258efb MK |
1833 | #endif |
/* Round key 0 is the first 16 bytes of the user key (unaligned load, aligned store). */
1834 | movups (UKEYP), %xmm0 # user key (first 16 bytes) |
1835 | movaps %xmm0, (KEYP) |
/* TKEYP is the write cursor for the next round key; key_len is saved at offset 480 of ctx. */
1836 | lea 0x10(KEYP), TKEYP # key addr |
1837 | movl %edx, 480(KEYP) |
54b6a1bd HY |
1838 | pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x |
/*
 * Dispatch on key_len (only the low byte of %edx is compared):
 * below 24 -> 128-bit path, equal to 24 -> 192-bit path, otherwise fall
 * through to the 256-bit path.
 */
1839 | cmp $24, %dl |
1840 | jb .Lenc_key128 |
1841 | je .Lenc_key192 |
0d258efb MK |
/* 256-bit path: the second 16 bytes of the user key become round key 1. */
1842 | movups 0x10(UKEYP), %xmm2 # other user key |
1843 | movaps %xmm2, (TKEYP) |
1844 | add $0x10, TKEYP |
/*
 * Thirteen expansion calls follow, alternating the "a" step (updates %xmm0,
 * assist taken from %xmm2) and the "b" step (updates %xmm2, assist taken from
 * %xmm0). Each call stores one round key.
 */
d7866e50 | 1845 | aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 |
54b6a1bd | 1846 | call _key_expansion_256a |
d7866e50 | 1847 | aeskeygenassist $0x1, %xmm0, %xmm1 |
54b6a1bd | 1848 | call _key_expansion_256b |
d7866e50 | 1849 | aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 |
54b6a1bd | 1850 | call _key_expansion_256a |
d7866e50 | 1851 | aeskeygenassist $0x2, %xmm0, %xmm1 |
54b6a1bd | 1852 | call _key_expansion_256b |
d7866e50 | 1853 | aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 |
54b6a1bd | 1854 | call _key_expansion_256a |
d7866e50 | 1855 | aeskeygenassist $0x4, %xmm0, %xmm1 |
54b6a1bd | 1856 | call _key_expansion_256b |
d7866e50 | 1857 | aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 |
54b6a1bd | 1858 | call _key_expansion_256a |
d7866e50 | 1859 | aeskeygenassist $0x8, %xmm0, %xmm1 |
54b6a1bd | 1860 | call _key_expansion_256b |
d7866e50 | 1861 | aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 |
54b6a1bd | 1862 | call _key_expansion_256a |
d7866e50 | 1863 | aeskeygenassist $0x10, %xmm0, %xmm1 |
54b6a1bd | 1864 | call _key_expansion_256b |
d7866e50 | 1865 | aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 |
54b6a1bd | 1866 | call _key_expansion_256a |
d7866e50 | 1867 | aeskeygenassist $0x20, %xmm0, %xmm1 |
54b6a1bd | 1868 | call _key_expansion_256b |
d7866e50 | 1869 | aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 |
54b6a1bd HY |
1870 | call _key_expansion_256a |
1871 | jmp .Ldec_key |
1872 | .Lenc_key192: |
/*
 * 192-bit path: movq loads the remaining 8 bytes of the user key into the low
 * half of %xmm2 (the upper half is zeroed). Eight calls follow, alternating
 * the 32-byte "a" step and the 16-byte "b" step.
 */
0d258efb | 1873 | movq 0x10(UKEYP), %xmm2 # other user key |
d7866e50 | 1874 | aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 |
54b6a1bd | 1875 | call _key_expansion_192a |
d7866e50 | 1876 | aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 |
54b6a1bd | 1877 | call _key_expansion_192b |
d7866e50 | 1878 | aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 |
54b6a1bd | 1879 | call _key_expansion_192a |
d7866e50 | 1880 | aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 |
54b6a1bd | 1881 | call _key_expansion_192b |
d7866e50 | 1882 | aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 |
54b6a1bd | 1883 | call _key_expansion_192a |
d7866e50 | 1884 | aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 |
54b6a1bd | 1885 | call _key_expansion_192b |
d7866e50 | 1886 | aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 |
54b6a1bd | 1887 | call _key_expansion_192a |
d7866e50 | 1888 | aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 |
54b6a1bd HY |
1889 | call _key_expansion_192b |
1890 | jmp .Ldec_key |
1891 | .Lenc_key128: |
/* 128-bit path: ten expansion calls, each deriving the next round key from %xmm0. */
d7866e50 | 1892 | aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 |
54b6a1bd | 1893 | call _key_expansion_128 |
d7866e50 | 1894 | aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 |
54b6a1bd | 1895 | call _key_expansion_128 |
d7866e50 | 1896 | aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 |
54b6a1bd | 1897 | call _key_expansion_128 |
d7866e50 | 1898 | aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 |
54b6a1bd | 1899 | call _key_expansion_128 |
d7866e50 | 1900 | aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 |
54b6a1bd | 1901 | call _key_expansion_128 |
d7866e50 | 1902 | aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 |
54b6a1bd | 1903 | call _key_expansion_128 |
d7866e50 | 1904 | aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 |
54b6a1bd | 1905 | call _key_expansion_128 |
d7866e50 | 1906 | aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 |
54b6a1bd | 1907 | call _key_expansion_128 |
d7866e50 | 1908 | aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 |
54b6a1bd | 1909 | call _key_expansion_128 |
d7866e50 | 1910 | aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 |
54b6a1bd HY |
1911 | call _key_expansion_128 |
1912 | .Ldec_key: |
0d258efb MK |
/*
 * Build the decryption schedule at ctx+240 in reverse order.
 * TKEYP sits one slot past the last encryption round key; step back onto it.
 */
1913 | sub $0x10, TKEYP |
/*
 * The two end keys are copied unchanged and swapped: the last encryption key
 * goes to 240(KEYP) and round key 0 goes to 240(TKEYP).
 */
1914 | movaps (KEYP), %xmm0 |
1915 | movaps (TKEYP), %xmm1 |
1916 | movaps %xmm0, 240(TKEYP) |
1917 | movaps %xmm1, 240(KEYP) |
/*
 * KEYP now reads forward through the middle encryption keys, while UKEYP
 * (no longer needed as the user-key pointer) writes backward from the slot
 * just below 240(TKEYP).
 */
1918 | add $0x10, KEYP |
1919 | lea 240-16(TKEYP), UKEYP |
54b6a1bd HY |
1920 | .align 4 |
1921 | .Ldec_key_loop: |
/* Each middle key is passed through aesimc before being stored. */
0d258efb | 1922 | movaps (KEYP), %xmm0 |
d7866e50 | 1923 | aesimc %xmm0, %xmm1 |
0d258efb MK |
1924 | movaps %xmm1, (UKEYP) |
1925 | add $0x10, KEYP |
1926 | sub $0x10, UKEYP |
/* Unsigned pointer compare: loop until the read cursor reaches the last encryption key. */
1927 | cmp TKEYP, KEYP |
54b6a1bd | 1928 | jb .Ldec_key_loop |
0d258efb MK |
/* Clear AREG -- presumably the return-value register, giving the int result 0; confirm at the AREG definition. */
1929 | xor AREG, AREG |
1930 | #ifndef __x86_64__ |
1931 | popl KEYP |
1932 | #endif |
8691ccd7 | 1933 | FRAME_END |
f94909ce | 1934 | RET |
6dcc5627 | 1935 | SYM_FUNC_END(aesni_set_key) |
54b6a1bd HY |
1936 | |
1937 | /* | |
9c1e8836 | 1938 | * void aesni_enc(const void *ctx, u8 *dst, const u8 *src) |
54b6a1bd | 1939 | */ |
6dcc5627 | 1940 | SYM_FUNC_START(aesni_enc) |
/*
 * Encrypt one 16-byte block: load it from src, run _aesni_enc1 with the key
 * schedule at ctx, store the result to dst. src and dst are accessed with
 * unaligned moves.
 */
8691ccd7 | 1941 | FRAME_BEGIN |
0d258efb MK |
/* 32-bit build only: save the registers used for KEYP/KLEN and fetch the arguments from the stack. */
1942 | #ifndef __x86_64__ |
1943 | pushl KEYP |
1944 | pushl KLEN |
8691ccd7 JP |
1945 | movl (FRAME_OFFSET+12)(%esp), KEYP # ctx |
1946 | movl (FRAME_OFFSET+16)(%esp), OUTP # dst |
1947 | movl (FRAME_OFFSET+20)(%esp), INP # src |
0d258efb | 1948 | #endif |
54b6a1bd HY |
/* The key length was stored at offset 480 of ctx by aesni_set_key. */
1949 | movl 480(KEYP), KLEN # key length |
1950 | movups (INP), STATE # input |
1951 | call _aesni_enc1 |
1952 | movups STATE, (OUTP) # output |
0d258efb MK |
1953 | #ifndef __x86_64__ |
1954 | popl KLEN |
1955 | popl KEYP |
1956 | #endif |
8691ccd7 | 1957 | FRAME_END |
f94909ce | 1958 | RET |
6dcc5627 | 1959 | SYM_FUNC_END(aesni_enc) |
54b6a1bd HY |
1960 | |
1961 | /* | |
1962 | * _aesni_enc1: internal ABI | |
1963 | * input: | |
1964 | * KEYP: key struct pointer | |
1965 | * KLEN: key length |
1966 | * STATE: initial state (input) | |
1967 | * output: | |
1968 | * STATE: final state (output) |
1969 | * changed: | |
1970 | * KEY | |
1971 | * TKEYP (T1) | |
1972 | */ | |
74d8b90a | 1973 | SYM_FUNC_START_LOCAL(_aesni_enc1) |
54b6a1bd HY |
/*
 * Encrypt the single block in STATE. The round sequence is one straight-line
 * run of aesenc instructions with three entry points; TKEYP is biased so that
 * the same displacements work for every key size:
 *   KLEN < 24  : TKEYP = KEYP + 0x30, enter at .Lenc128 (9 aesenc + aesenclast)
 *   KLEN == 24 : TKEYP = KEYP + 0x50, enter at .Lenc192 (2 more aesenc)
 *   otherwise  : TKEYP = KEYP + 0x70, fall through       (4 more aesenc)
 * In every case the first key used after round 0 is the one at KEYP + 0x10.
 */
1974 | movaps (KEYP), KEY # key |
1975 | mov KEYP, TKEYP |
1976 | pxor KEY, STATE # round 0 |
1977 | add $0x30, TKEYP |
1978 | cmp $24, KLEN |
1979 | jb .Lenc128 |
/* lea is used for this adjustment because it leaves the flags from the cmp intact for the je below. */
1980 | lea 0x20(TKEYP), TKEYP |
1981 | je .Lenc192 |
/* Largest key size: two rounds beyond the 192-bit entry point. */
1982 | add $0x20, TKEYP |
1983 | movaps -0x60(TKEYP), KEY |
d7866e50 | 1984 | aesenc KEY, STATE |
54b6a1bd | 1985 | movaps -0x50(TKEYP), KEY |
d7866e50 | 1986 | aesenc KEY, STATE |
54b6a1bd HY |
1987 | .align 4 |
1988 | .Lenc192: |
/* Two rounds beyond the 128-bit entry point. */
1989 | movaps -0x40(TKEYP), KEY |
d7866e50 | 1990 | aesenc KEY, STATE |
54b6a1bd | 1991 | movaps -0x30(TKEYP), KEY |
d7866e50 | 1992 | aesenc KEY, STATE |
54b6a1bd HY |
1993 | .align 4 |
1994 | .Lenc128: |
/* Common tail: nine aesenc rounds followed by the final aesenclast. */
1995 | movaps -0x20(TKEYP), KEY |
d7866e50 | 1996 | aesenc KEY, STATE |
54b6a1bd | 1997 | movaps -0x10(TKEYP), KEY |
d7866e50 | 1998 | aesenc KEY, STATE |
54b6a1bd | 1999 | movaps (TKEYP), KEY |
d7866e50 | 2000 | aesenc KEY, STATE |
54b6a1bd | 2001 | movaps 0x10(TKEYP), KEY |
d7866e50 | 2002 | aesenc KEY, STATE |
54b6a1bd | 2003 | movaps 0x20(TKEYP), KEY |
d7866e50 | 2004 | aesenc KEY, STATE |
54b6a1bd | 2005 | movaps 0x30(TKEYP), KEY |
d7866e50 | 2006 | aesenc KEY, STATE |
54b6a1bd | 2007 | movaps 0x40(TKEYP), KEY |
d7866e50 | 2008 | aesenc KEY, STATE |
54b6a1bd | 2009 | movaps 0x50(TKEYP), KEY |
d7866e50 | 2010 | aesenc KEY, STATE |
54b6a1bd | 2011 | movaps 0x60(TKEYP), KEY |
d7866e50 | 2012 | aesenc KEY, STATE |
54b6a1bd | 2013 | movaps 0x70(TKEYP), KEY |
d7866e50 | 2014 | aesenclast KEY, STATE |
f94909ce | 2015 | RET |
74d8b90a | 2016 | SYM_FUNC_END(_aesni_enc1) |
54b6a1bd HY |
2017 | |
2018 | /* | |
2019 | * _aesni_enc4: internal ABI | |
2020 | * input: | |
2021 | * KEYP: key struct pointer | |
2022 | * KLEN: key length |
2023 | * STATE1: initial state (input) | |
2024 | * STATE2 | |
2025 | * STATE3 | |
2026 | * STATE4 | |
2027 | * output: | |
2028 | * STATE1: final state (output) |
2029 | * STATE2 | |
2030 | * STATE3 | |
2031 | * STATE4 | |
2032 | * changed: | |
2033 | * KEY | |
2034 | * TKEYP (T1) | |
2035 | */ | |
74d8b90a | 2036 | SYM_FUNC_START_LOCAL(_aesni_enc4) |
54b6a1bd HY |
/*
 * Encrypt the four blocks in STATE1..STATE4 with the same key schedule. Each
 * round key is loaded once into KEY and applied to all four states
 * back-to-back. As in _aesni_enc1, TKEYP is biased by key size so one run of
 * displacements serves all three entry points:
 *   KLEN < 24  : TKEYP = KEYP + 0x30, enter at .L4enc128
 *   KLEN == 24 : TKEYP = KEYP + 0x50, enter at .L4enc192
 *   otherwise  : TKEYP = KEYP + 0x70, fall through
 */
2037 | movaps (KEYP), KEY # key |
2038 | mov KEYP, TKEYP |
2039 | pxor KEY, STATE1 # round 0 |
2040 | pxor KEY, STATE2 |
2041 | pxor KEY, STATE3 |
2042 | pxor KEY, STATE4 |
2043 | add $0x30, TKEYP |
2044 | cmp $24, KLEN |
2045 | jb .L4enc128 |
/* lea is used for this adjustment because it leaves the flags from the cmp intact for the je below. */
2046 | lea 0x20(TKEYP), TKEYP |
2047 | je .L4enc192 |
/* Largest key size: two rounds beyond the 192-bit entry point. */
2048 | add $0x20, TKEYP |
2049 | movaps -0x60(TKEYP), KEY |
d7866e50 UB |
2050 | aesenc KEY, STATE1 |
2051 | aesenc KEY, STATE2 |
2052 | aesenc KEY, STATE3 |
2053 | aesenc KEY, STATE4 |
54b6a1bd | 2054 | movaps -0x50(TKEYP), KEY |
d7866e50 UB |
2055 | aesenc KEY, STATE1 |
2056 | aesenc KEY, STATE2 |
2057 | aesenc KEY, STATE3 |
2058 | aesenc KEY, STATE4 |
54b6a1bd HY |
/* NOTE(review): the alignment directive below is commented out in this routine. */
2059 | #.align 4 |
2060 | .L4enc192: |
/* Two rounds beyond the 128-bit entry point. */
2061 | movaps -0x40(TKEYP), KEY |
d7866e50 UB |
2062 | aesenc KEY, STATE1 |
2063 | aesenc KEY, STATE2 |
2064 | aesenc KEY, STATE3 |
2065 | aesenc KEY, STATE4 |
54b6a1bd | 2066 | movaps -0x30(TKEYP), KEY |
d7866e50 UB |
2067 | aesenc KEY, STATE1 |
2068 | aesenc KEY, STATE2 |
2069 | aesenc KEY, STATE3 |
2070 | aesenc KEY, STATE4 |
54b6a1bd HY |
2071 | #.align 4 |
2072 | .L4enc128: |
/* Common tail: nine aesenc rounds followed by the final aesenclast, on all four states. */
2073 | movaps -0x20(TKEYP), KEY |
d7866e50 UB |
2074 | aesenc KEY, STATE1 |
2075 | aesenc KEY, STATE2 |
2076 | aesenc KEY, STATE3 |
2077 | aesenc KEY, STATE4 |
54b6a1bd | 2078 | movaps -0x10(TKEYP), KEY |
d7866e50 UB |
2079 | aesenc KEY, STATE1 |
2080 | aesenc KEY, STATE2 |
2081 | aesenc KEY, STATE3 |
2082 | aesenc KEY, STATE4 |
54b6a1bd | 2083 | movaps (TKEYP), KEY |
d7866e50 UB |
2084 | aesenc KEY, STATE1 |
2085 | aesenc KEY, STATE2 |
2086 | aesenc KEY, STATE3 |
2087 | aesenc KEY, STATE4 |
54b6a1bd | 2088 | movaps 0x10(TKEYP), KEY |
d7866e50 UB |
2089 | aesenc KEY, STATE1 |
2090 | aesenc KEY, STATE2 |
2091 | aesenc KEY, STATE3 |
2092 | aesenc KEY, STATE4 |
54b6a1bd | 2093 | movaps 0x20(TKEYP), KEY |
d7866e50 UB |
2094 | aesenc KEY, STATE1 |
2095 | aesenc KEY, STATE2 |
2096 | aesenc KEY, STATE3 |
2097 | aesenc KEY, STATE4 |
54b6a1bd | 2098 | movaps 0x30(TKEYP), KEY |
d7866e50 UB |
2099 | aesenc KEY, STATE1 |
2100 | aesenc KEY, STATE2 |
2101 | aesenc KEY, STATE3 |
2102 | aesenc KEY, STATE4 |
54b6a1bd | 2103 | movaps 0x40(TKEYP), KEY |
d7866e50 UB |
2104 | aesenc KEY, STATE1 |
2105 | aesenc KEY, STATE2 |
2106 | aesenc KEY, STATE3 |
2107 | aesenc KEY, STATE4 |
54b6a1bd | 2108 | movaps 0x50(TKEYP), KEY |
d7866e50 UB |
2109 | aesenc KEY, STATE1 |
2110 | aesenc KEY, STATE2 |
2111 | aesenc KEY, STATE3 |
2112 | aesenc KEY, STATE4 |
54b6a1bd | 2113 | movaps 0x60(TKEYP), KEY |
d7866e50 UB |
2114 | aesenc KEY, STATE1 |
2115 | aesenc KEY, STATE2 |
2116 | aesenc KEY, STATE3 |
2117 | aesenc KEY, STATE4 |
54b6a1bd | 2118 | movaps 0x70(TKEYP), KEY |
d7866e50 UB |
2119 | aesenclast KEY, STATE1 # last round |
2120 | aesenclast KEY, STATE2 |
2121 | aesenclast KEY, STATE3 |
2122 | aesenclast KEY, STATE4 |
f94909ce | 2123 | RET |
74d8b90a | 2124 | SYM_FUNC_END(_aesni_enc4) |
54b6a1bd HY |
2125 | |
2126 | /* | |
9c1e8836 | 2127 | * void aesni_dec (const void *ctx, u8 *dst, const u8 *src) |
54b6a1bd | 2128 | */ |
6dcc5627 | 2129 | SYM_FUNC_START(aesni_dec) |
/*
 * Decrypt one 16-byte block: load it from src, run _aesni_dec1 with the
 * decryption key schedule, store the result to dst. src and dst are accessed
 * with unaligned moves.
 */
8691ccd7 | 2130 | FRAME_BEGIN |
0d258efb MK |
/* 32-bit build only: save the registers used for KEYP/KLEN and fetch the arguments from the stack. */
2131 | #ifndef __x86_64__ |
2132 | pushl KEYP |
2133 | pushl KLEN |
8691ccd7 JP |
2134 | movl (FRAME_OFFSET+12)(%esp), KEYP # ctx |
2135 | movl (FRAME_OFFSET+16)(%esp), OUTP # dst |
2136 | movl (FRAME_OFFSET+20)(%esp), INP # src |
0d258efb | 2137 | #endif |
54b6a1bd HY |
/* Read the key length first, while KEYP still points at the start of ctx. */
2138 | mov 480(KEYP), KLEN # key length |
/* Advance KEYP to ctx+240, where aesni_set_key placed the decryption round keys. */
2139 | add $240, KEYP |
2140 | movups (INP), STATE # input |
2141 | call _aesni_dec1 |
2142 | movups STATE, (OUTP) #output |
0d258efb MK |
2143 | #ifndef __x86_64__ |
2144 | popl KLEN |
2145 | popl KEYP |
2146 | #endif |
8691ccd7 | 2147 | FRAME_END |
f94909ce | 2148 | RET |
6dcc5627 | 2149 | SYM_FUNC_END(aesni_dec) |
54b6a1bd HY |
2150 | |
2151 | /* | |
2152 | * _aesni_dec1: internal ABI | |
2153 | * input: | |
2154 | * KEYP: key struct pointer | |
2155 | * KLEN: key length | |
2156 | * STATE: initial state (input) | |
2157 | * output: | |
2158 | * STATE: final state (output) |
2159 | * changed: | |
2160 | * KEY | |
2161 | * TKEYP (T1) | |
2162 | */ | |
74d8b90a | 2163 | SYM_FUNC_START_LOCAL(_aesni_dec1) |
54b6a1bd HY |
/*
 * Decrypt the single block in STATE. KEYP must point at the first key of the
 * decryption schedule (the exported callers in this file add 240 to the ctx
 * pointer before calling). The layout mirrors _aesni_enc1: TKEYP is biased by
 * key size so one run of displacements serves all three entry points:
 *   KLEN < 24  : TKEYP = KEYP + 0x30, enter at .Ldec128 (9 aesdec + aesdeclast)
 *   KLEN == 24 : TKEYP = KEYP + 0x50, enter at .Ldec192 (2 more aesdec)
 *   otherwise  : TKEYP = KEYP + 0x70, fall through       (4 more aesdec)
 */
2164 | movaps (KEYP), KEY # key |
2165 | mov KEYP, TKEYP |
2166 | pxor KEY, STATE # round 0 |
2167 | add $0x30, TKEYP |
2168 | cmp $24, KLEN |
2169 | jb .Ldec128 |
/* lea is used for this adjustment because it leaves the flags from the cmp intact for the je below. */
2170 | lea 0x20(TKEYP), TKEYP |
2171 | je .Ldec192 |
/* Largest key size: two rounds beyond the 192-bit entry point. */
2172 | add $0x20, TKEYP |
2173 | movaps -0x60(TKEYP), KEY |
d7866e50 | 2174 | aesdec KEY, STATE |
54b6a1bd | 2175 | movaps -0x50(TKEYP), KEY |
d7866e50 | 2176 | aesdec KEY, STATE |
54b6a1bd HY |
2177 | .align 4 |
2178 | .Ldec192: |
/* Two rounds beyond the 128-bit entry point. */
2179 | movaps -0x40(TKEYP), KEY |
d7866e50 | 2180 | aesdec KEY, STATE |
54b6a1bd | 2181 | movaps -0x30(TKEYP), KEY |
d7866e50 | 2182 | aesdec KEY, STATE |
54b6a1bd HY |
2183 | .align 4 |
2184 | .Ldec128: |
/* Common tail: nine aesdec rounds followed by the final aesdeclast. */
2185 | movaps -0x20(TKEYP), KEY |
d7866e50 | 2186 | aesdec KEY, STATE |
54b6a1bd | 2187 | movaps -0x10(TKEYP), KEY |
d7866e50 | 2188 | aesdec KEY, STATE |
54b6a1bd | 2189 | movaps (TKEYP), KEY |
d7866e50 | 2190 | aesdec KEY, STATE |
54b6a1bd | 2191 | movaps 0x10(TKEYP), KEY |
d7866e50 | 2192 | aesdec KEY, STATE |
54b6a1bd | 2193 | movaps 0x20(TKEYP), KEY |
d7866e50 | 2194 | aesdec KEY, STATE |
54b6a1bd | 2195 | movaps 0x30(TKEYP), KEY |
d7866e50 | 2196 | aesdec KEY, STATE |
54b6a1bd | 2197 | movaps 0x40(TKEYP), KEY |
d7866e50 | 2198 | aesdec KEY, STATE |
54b6a1bd | 2199 | movaps 0x50(TKEYP), KEY |
d7866e50 | 2200 | aesdec KEY, STATE |
54b6a1bd | 2201 | movaps 0x60(TKEYP), KEY |
d7866e50 | 2202 | aesdec KEY, STATE |
54b6a1bd | 2203 | movaps 0x70(TKEYP), KEY |
d7866e50 | 2204 | aesdeclast KEY, STATE |
f94909ce | 2205 | RET |
74d8b90a | 2206 | SYM_FUNC_END(_aesni_dec1) |
54b6a1bd HY |
2207 | |
2208 | /* | |
2209 | * _aesni_dec4: internal ABI | |
2210 | * input: | |
2211 | * KEYP: key struct pointer | |
2212 | * KLEN: key length | |
2213 | * STATE1: initial state (input) | |
2214 | * STATE2 | |
2215 | * STATE3 | |
2216 | * STATE4 | |
2217 | * output: | |
2218 | * STATE1: final state (output) |
2219 | * STATE2 | |
2220 | * STATE3 | |
2221 | * STATE4 | |
2222 | * changed: | |
2223 | * KEY | |
2224 | * TKEYP (T1) | |
2225 | */ | |
74d8b90a | 2226 | SYM_FUNC_START_LOCAL(_aesni_dec4) |
54b6a1bd HY |
/*
 * Decrypt the four blocks in STATE1..STATE4 with the same decryption key
 * schedule. Each round key is loaded once into KEY and applied to all four
 * states back-to-back. As in _aesni_dec1, TKEYP is biased by key size so one
 * run of displacements serves all three entry points:
 *   KLEN < 24  : TKEYP = KEYP + 0x30, enter at .L4dec128
 *   KLEN == 24 : TKEYP = KEYP + 0x50, enter at .L4dec192
 *   otherwise  : TKEYP = KEYP + 0x70, fall through
 */
2227 | movaps (KEYP), KEY # key |
2228 | mov KEYP, TKEYP |
2229 | pxor KEY, STATE1 # round 0 |
2230 | pxor KEY, STATE2 |
2231 | pxor KEY, STATE3 |
2232 | pxor KEY, STATE4 |
2233 | add $0x30, TKEYP |
2234 | cmp $24, KLEN |
2235 | jb .L4dec128 |
/* lea is used for this adjustment because it leaves the flags from the cmp intact for the je below. */
2236 | lea 0x20(TKEYP), TKEYP |
2237 | je .L4dec192 |
/* Largest key size: two rounds beyond the 192-bit entry point. */
2238 | add $0x20, TKEYP |
2239 | movaps -0x60(TKEYP), KEY |
d7866e50 UB |
2240 | aesdec KEY, STATE1 |
2241 | aesdec KEY, STATE2 |
2242 | aesdec KEY, STATE3 |
2243 | aesdec KEY, STATE4 |
54b6a1bd | 2244 | movaps -0x50(TKEYP), KEY |
d7866e50 UB |
2245 | aesdec KEY, STATE1 |
2246 | aesdec KEY, STATE2 |
2247 | aesdec KEY, STATE3 |
2248 | aesdec KEY, STATE4 |
54b6a1bd HY |
2249 | .align 4 |
2250 | .L4dec192: |
/* Two rounds beyond the 128-bit entry point. */
2251 | movaps -0x40(TKEYP), KEY |
d7866e50 UB |
2252 | aesdec KEY, STATE1 |
2253 | aesdec KEY, STATE2 |
2254 | aesdec KEY, STATE3 |
2255 | aesdec KEY, STATE4 |
54b6a1bd | 2256 | movaps -0x30(TKEYP), KEY |
d7866e50 UB |
2257 | aesdec KEY, STATE1 |
2258 | aesdec KEY, STATE2 |
2259 | aesdec KEY, STATE3 |
2260 | aesdec KEY, STATE4 |
54b6a1bd HY |
2261 | .align 4 |
2262 | .L4dec128: |
/* Common tail: nine aesdec rounds followed by the final aesdeclast, on all four states. */
2263 | movaps -0x20(TKEYP), KEY |
d7866e50 UB |
2264 | aesdec KEY, STATE1 |
2265 | aesdec KEY, STATE2 |
2266 | aesdec KEY, STATE3 |
2267 | aesdec KEY, STATE4 |
54b6a1bd | 2268 | movaps -0x10(TKEYP), KEY |
d7866e50 UB |
2269 | aesdec KEY, STATE1 |
2270 | aesdec KEY, STATE2 |
2271 | aesdec KEY, STATE3 |
2272 | aesdec KEY, STATE4 |
54b6a1bd | 2273 | movaps (TKEYP), KEY |
d7866e50 UB |
2274 | aesdec KEY, STATE1 |
2275 | aesdec KEY, STATE2 |
2276 | aesdec KEY, STATE3 |
2277 | aesdec KEY, STATE4 |
54b6a1bd | 2278 | movaps 0x10(TKEYP), KEY |
d7866e50 UB |
2279 | aesdec KEY, STATE1 |
2280 | aesdec KEY, STATE2 |
2281 | aesdec KEY, STATE3 |
2282 | aesdec KEY, STATE4 |
54b6a1bd | 2283 | movaps 0x20(TKEYP), KEY |
d7866e50 UB |
2284 | aesdec KEY, STATE1 |
2285 | aesdec KEY, STATE2 |
2286 | aesdec KEY, STATE3 |
2287 | aesdec KEY, STATE4 |
54b6a1bd | 2288 | movaps 0x30(TKEYP), KEY |
d7866e50 UB |
2289 | aesdec KEY, STATE1 |
2290 | aesdec KEY, STATE2 |
2291 | aesdec KEY, STATE3 |
2292 | aesdec KEY, STATE4 |
54b6a1bd | 2293 | movaps 0x40(TKEYP), KEY |
d7866e50 UB |
2294 | aesdec KEY, STATE1 |
2295 | aesdec KEY, STATE2 |
2296 | aesdec KEY, STATE3 |
2297 | aesdec KEY, STATE4 |
54b6a1bd | 2298 | movaps 0x50(TKEYP), KEY |
d7866e50 UB |
2299 | aesdec KEY, STATE1 |
2300 | aesdec KEY, STATE2 |
2301 | aesdec KEY, STATE3 |
2302 | aesdec KEY, STATE4 |
54b6a1bd | 2303 | movaps 0x60(TKEYP), KEY |
d7866e50 UB |
2304 | aesdec KEY, STATE1 |
2305 | aesdec KEY, STATE2 |
2306 | aesdec KEY, STATE3 |
2307 | aesdec KEY, STATE4 |
54b6a1bd | 2308 | movaps 0x70(TKEYP), KEY |
d7866e50 UB |
2309 | aesdeclast KEY, STATE1 # last round |
2310 | aesdeclast KEY, STATE2 |
2311 | aesdeclast KEY, STATE3 |
2312 | aesdeclast KEY, STATE4 |
f94909ce | 2313 | RET |
74d8b90a | 2314 | SYM_FUNC_END(_aesni_dec4) |
54b6a1bd HY |
2315 | |
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt every whole 16-byte block of src into dst. Blocks are handled
 * four at a time while at least 64 bytes remain, then one at a time. Any
 * trailing partial block (len % 16) is left untouched, and nothing is done
 * when len < 16.
 *
 * On 32-bit builds the arguments are fetched from the stack and the registers
 * behind LEN, KEYP and KLEN are saved and restored around the body.
 *
 * len is an unsigned size_t, so every length test uses the unsigned
 * conditions (jb/jae). A signed test would treat a remaining length with the
 * top bit set as negative and leave the loop early.
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN			# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN		# key length stored by aesni_set_key
	cmp $16, LEN
	jb .Lecb_enc_ret		# less than one whole block
	cmp $64, LEN
	jb .Lecb_enc_loop1		# fewer than four blocks
.align 4
.Lecb_enc_loop4:
	# LEN >= 64 here: encrypt four blocks per iteration
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4		# unsigned: LEN is a size_t
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	# 16 <= LEN here: encrypt one block per iteration.
	# NOTE(review): _aesni_enc1 operates on STATE; this relies on STATE1
	# naming the same register as STATE -- confirm at their definitions.
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1		# unsigned: LEN is a size_t
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)
54b6a1bd HY |
2375 | |
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt every whole 16-byte block of src into dst. Blocks are handled
 * four at a time while at least 64 bytes remain, then one at a time. Any
 * trailing partial block (len % 16) is left untouched, and nothing is done
 * when len < 16.
 *
 * On 32-bit builds the arguments are fetched from the stack and the registers
 * behind LEN, KEYP and KLEN are saved and restored around the body.
 *
 * len is an unsigned size_t, so every length test uses the unsigned
 * conditions (jb/jae). A signed test would treat a remaining length with the
 * top bit set as negative and leave the loop early.
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN		# key length stored by aesni_set_key
	add $240, KEYP			# decryption round keys start at ctx+240
	cmp $16, LEN
	jb .Lecb_dec_ret		# less than one whole block
	cmp $64, LEN
	jb .Lecb_dec_loop1		# fewer than four blocks
.align 4
.Lecb_dec_loop4:
	# LEN >= 64 here: decrypt four blocks per iteration
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4		# unsigned: LEN is a size_t
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	# 16 <= LEN here: decrypt one block per iteration.
	# NOTE(review): _aesni_dec1 operates on STATE; this relies on STATE1
	# naming the same register as STATE -- confirm at their definitions.
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1		# unsigned: LEN is a size_t
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)
54b6a1bd HY |
2436 | |
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt whole 16-byte blocks from src to dst, one block per iteration
 * (each block depends on the previous ciphertext).  On return *iv holds the
 * last ciphertext block so that a following call can continue the chain.
 * If len < 16 nothing is processed and *iv is left untouched.
 *
 * The length is unsigned, so the loop condition uses jae rather than jge.
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: arguments live on the stack, above the four saved registers */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	/* NOTE(review): offset 480 is presumably ctx->key_length - confirm */
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		/* chain: plaintext ^ previous ciphertext (or iv) */
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop	/* unsigned: at least one block left */
	movups STATE, (IVP)	/* last ciphertext block becomes the next iv */
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)
54b6a1bd HY |
2480 | |
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt whole 16-byte blocks from src to dst.  Four blocks are
 * decrypted per iteration while at least 64 bytes remain, then one block at
 * a time.  On return *iv holds the last ciphertext block consumed.  If
 * len < 16 nothing is processed and *iv is left untouched.
 *
 * All ciphertext needed for chaining is read (or re-read) before the
 * corresponding output is stored.
 *
 * The length is unsigned, so the loop conditions use jae rather than jge.
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: arguments live on the stack, above the four saved registers */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	/* NOTE(review): offset 480 is presumably ctx->key_length - confirm */
	mov 480(KEYP), KLEN
	/* NOTE(review): offset 240 is presumably the decryption key schedule */
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/* only IN1/IN2 are used here: they are recycled for blocks 2 and 3 */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4	/* IN1 currently holds ciphertext block 2 */
	movaps IN2, IV		/* IN2 currently holds ciphertext block 3 */
	movups (INP), IN1	/* reload ciphertext block 0 */
	pxor IN1, STATE2
	movups 0x10(INP), IN2	/* reload ciphertext block 1 */
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4	/* unsigned: at least 64 bytes left */
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV		/* this ciphertext block chains into the next */
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1	/* unsigned: at least one block left */
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)
12387a46 | 2573 | |
ddf169a9 AB |
/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * CBC encryption of the final two blocks with ciphertext stealing.
 *
 * With d = len - 16, the code reads src[0..16) and src[d..d+16), and writes
 * dst[d..d+16) followed by dst[0..16).
 * NOTE(review): this only makes sense for 16 < len <= 32; nothing here
 * checks it - confirm the caller guarantees that range.
 *
 * IVP is reused as a table pointer after the iv has been loaded, and no iv
 * is written back.
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	movups (IVP), STATE

	/* LEN = d; pick two 16-byte windows out of the permute table */
	sub $16, LEN
	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	movups (T1), %xmm4		/* table[d .. d+16) */
	movups (IVP), %xmm5		/* table[32-d .. 48-d) */

	movups (INP), IN1		/* first block */
	add LEN, INP
	movups (INP), IN2		/* 16 bytes ending at the end of src */

	/* regular CBC step on the first block */
	pxor IN1, STATE
	call _aesni_enc1

	pshufb %xmm5, IN2		/* shuffle the tail; 0x80 selectors zero a byte */
	pxor STATE, IN2
	pshufb %xmm4, STATE
	add OUTP, LEN			/* LEN = dst + d */
	movups STATE, (LEN)		/* overlaps dst[d..16): must precede the store below */

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_enc)
2630 | ||
/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * CBC decryption of the final two blocks with ciphertext stealing.
 *
 * With d = len - 16, the code reads src[0..16) and src[d..d+16), and writes
 * dst[d..d+16) followed by dst[0..16).
 * NOTE(review): this only makes sense for 16 < len <= 32; nothing here
 * checks it - confirm the caller guarantees that range.
 *
 * IVP is reused as a table pointer after the iv has been loaded, and no iv
 * is written back.  %xmm0 is loaded explicitly because pblendvb takes its
 * byte-select mask implicitly from %xmm0.
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN
	/* NOTE(review): offset 240 is presumably the decryption key schedule */
	add $240, KEYP
	movups (IVP), IV

	/* LEN = d; T1 -> table[d], IVP -> table[32-d] */
	sub $16, LEN
	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	movups (T1), %xmm4

	movups (INP), STATE		/* first block */
	add LEN, INP
	movups (INP), IN1		/* 16 bytes ending at the end of src */

	call _aesni_dec1
	movaps STATE, IN2		/* keep the unshuffled result for the blend */
	pshufb %xmm4, STATE
	pxor IN1, STATE

	add OUTP, LEN			/* LEN = dst + d */
	movups STATE, (LEN)		/* overlaps dst[d..16): must precede the store below */

	movups (IVP), %xmm0		/* shuffle control, then implicit blend mask */
	pshufb %xmm0, IN1
	pblendvb IN2, IN1		/* bytes whose mask high bit is set come from IN2 */
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_dec)
2691 | ||
.pushsection .rodata
.align 16
/*
 * pshufb control bytes for the ciphertext-stealing paths: 16 bytes of 0x80,
 * the identity permutation 0..15, then another 16 bytes of 0x80.  A 16-byte
 * window taken at byte offset k (0 <= k <= 32) yields a control vector that
 * shifts a block by a whole number of bytes; a selector with its high bit
 * set (0x80) makes pshufb write a zero byte.
 */
.Lcts_permute_table:
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
#ifdef __x86_64__
/*
 * pshufb control that reverses the 16 bytes of a register.  It sits 48 bytes
 * after the 16-byte aligned table above, so it is itself 16-byte aligned.
 */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
#endif
.popsection
12387a46 | 2706 | |
ddf169a9 | 2707 | #ifdef __x86_64__ |
12387a46 HY |
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 *
 * This code is only built for x86_64, so the mask is loaded RIP-relative
 * like the other 64-bit data references in this file; an absolute reference
 * would need a 32-bit absolute relocation.
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR		/* byte-reverse IV into CTR */
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC		/* INC = 1 in the low qword, 0 in the high */
	movq CTR, TCTR_LOW		/* scalar copy of the low counter qword */
	RET
SYM_FUNC_END(_aesni_inc_init)
12387a46 HY |
2728 | |
/*
 * _aesni_inc:		internal ABI
 *	Advance the big endian counter block in IV by one.
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	incremented by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *
 * paddq adds per 64-bit lane without carrying between lanes, so the scalar
 * copy in TCTR_LOW is incremented alongside to detect a carry out of the low
 * qword; on carry, INC is temporarily moved to the high lane to propagate it.
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low			/* no carry out of the low qword */
	pslldq $8, INC			/* INC: low lane -> high lane */
	paddq INC, CTR			/* bump the high qword */
	psrldq $8, INC			/* restore INC */
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV		/* back to big endian */
	RET
SYM_FUNC_END(_aesni_inc)
12387a46 HY |
2756 | |
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode over whole 16-byte blocks: each counter value is encrypted and
 * XORed with the corresponding input block.  Four blocks are handled per
 * iteration while at least 64 bytes remain, then one block at a time.  On
 * return *iv holds the next unused counter value.  If len < 16 nothing is
 * processed and *iv is left untouched.
 *
 * The length is unsigned, so the loop conditions use jae rather than jge.
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	/* NOTE(review): offset 480 is presumably ctx->key_length - confirm */
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* four consecutive counter blocks; input loads are interleaved */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4	/* unsigned: at least 64 bytes left */
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1	/* unsigned: at least one block left */
.Lctr_enc_ret:
	movups IV, (IVP)	/* hand the advanced counter back to the caller */
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)
c456a9cd | 2819 | |
2481104f AB |
2820 | #endif |
2821 | ||
/*
 * 16-byte constant for _aesni_gf128mul_x_ble: low qword 0x87, high qword
 * 0x01.  Loaded with movdqa, hence the 16-byte alignment.
 */
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.previous
2827 | ||
c456a9cd JK |
/*
 * _aesni_gf128mul_x_ble:		internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	KEY:	== temporary value
 *
 * How it works: pshufd/psrad turn the top bit of each 64-bit half of IV into
 * an all-ones or all-zero dword positioned over the other half; paddq
 * doubles both halves independently; the pand/pxor then carry the lost bit
 * of the low half into the high half (0x01) and fold the lost bit of the
 * high half back into the low half (0x87).
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, KEY; \
	paddq IV, IV; \
	psrad $31, KEY; \
	pand GF128MUL_MASK, KEY; \
	pxor KEY, IV;
c456a9cd JK |
2845 | |
/*
 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			  const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS encryption.  Layout of the routine:
 *   - .Lxts_enc_loop4:  four blocks per iteration while >= 64 bytes remain
 *   - .Lxts_enc_loop1:  one block per iteration for the remaining full blocks
 *   - .Lxts_enc_cts*:   ciphertext stealing for a trailing partial block
 *
 * When the length is a multiple of 16 the advanced tweak is written back to
 * *iv.  The ciphertext-stealing path reuses IVP as a table pointer and
 * therefore returns through .Lxts_enc_ret without storing the tweak.
 *
 * The remaining length is tracked with "subtract, then branch if negative",
 * so LEN is deliberately treated as a signed quantity here.
 * NOTE(review): .Lxts_enc_cts4 consumes STATE4 from a previous 4-block pass;
 * this presumes the caller never passes len < 16 - confirm.
 */
SYM_FUNC_START(aesni_xts_encrypt)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV

	mov 480(KEYP), KLEN

.Lxts_enc_loop4:
	sub $64, LEN
	jl .Lxts_enc_1x			/* fewer than 64 bytes left */

	/*
	 * For each of the four blocks: state = tweak ^ input, and the tweak
	 * itself is parked in the matching output slot until after the
	 * cipher call.
	 */
	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

	call _aesni_enc4

	/* fetch each parked tweak back, apply it, store the ciphertext */
	movdqu 0x00(OUTP), IN
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()		/* tweak for the next block */

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_enc_loop4

.Lxts_enc_ret_iv:
	movups IV, (IVP)

.Lxts_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_enc_1x:
	add $64, LEN			/* undo the speculative subtraction */
	jz .Lxts_enc_ret_iv
	sub $16, LEN
	jl .Lxts_enc_cts4		/* only a partial block is left */

.Lxts_enc_loop1:
	movdqu (INP), STATE
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
	_aesni_gf128mul_x_ble()

	test LEN, LEN
	jz .Lxts_enc_out		/* that was the last block */

	add $16, INP
	sub $16, LEN
	jl .Lxts_enc_cts1		/* partial block follows: steal from STATE */

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_enc_loop1

.Lxts_enc_out:
	movdqu STATE, (OUTP)
	jmp .Lxts_enc_ret_iv

.Lxts_enc_cts4:
	/* steal from the last block produced by the 4-way loop */
	movdqa STATE4, STATE
	sub $16, OUTP

.Lxts_enc_cts1:
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $16, LEN		/* # bytes in final block */
	movups (INP), IN1

	/* T1 -> table[LEN], IVP -> table[32-LEN]; IVP no longer points at iv */
	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	add OUTP, LEN			/* LEN = address of the partial output block */

	movups (T1), %xmm4
	movaps STATE, IN2
	pshufb %xmm4, STATE
	movups STATE, (LEN)		/* overlaps the block at OUTP, which is stored last */

	movups (IVP), %xmm0		/* shuffle control, then implicit blend mask */
	pshufb %xmm0, IN1
	pblendvb IN2, IN1		/* bytes whose mask high bit is set come from IN2 */
	movaps IN1, STATE

	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE

	movups STATE, (OUTP)
	jmp .Lxts_enc_ret
SYM_FUNC_END(aesni_xts_encrypt)
3000 | ||
/*
 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			  const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS decryption.  Layout of the routine:
 *   - .Lxts_dec_loop4:  four blocks per iteration while >= 64 bytes remain
 *   - .Lxts_dec_loop1:  one block per iteration for the remaining full blocks
 *   - .Lxts_dec_cts1:   ciphertext stealing for a trailing partial block
 *
 * If len is not a multiple of 16, one full block is held back from the bulk
 * loops so that the ciphertext-stealing path can process it together with
 * the partial block; that path uses the two final tweaks in swapped order.
 *
 * When the length is a multiple of 16 the advanced tweak is written back to
 * *iv.  The ciphertext-stealing path reuses IVP as a table pointer and
 * therefore returns through .Lxts_dec_ret without storing the tweak.
 *
 * The remaining length is tracked with "subtract, then branch if negative",
 * so LEN is deliberately treated as a signed quantity here.
 */
SYM_FUNC_START(aesni_xts_decrypt)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV

	mov 480(KEYP), KLEN
	/* NOTE(review): offset 240 is presumably the decryption key schedule */
	add $240, KEYP

	test $15, LEN
	jz .Lxts_dec_loop4
	sub $16, LEN			/* hold one full block back for stealing */

.Lxts_dec_loop4:
	sub $64, LEN
	jl .Lxts_dec_1x			/* fewer than 64 bytes left */

	/*
	 * For each of the four blocks: state = tweak ^ input, and the tweak
	 * itself is parked in the matching output slot until after the
	 * cipher call.
	 */
	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

	call _aesni_dec4

	/* fetch each parked tweak back, apply it, store the plaintext */
	movdqu 0x00(OUTP), IN
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()		/* tweak for the next block */

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_dec_loop4

.Lxts_dec_ret_iv:
	movups IV, (IVP)

.Lxts_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_dec_1x:
	add $64, LEN			/* undo the speculative subtraction */
	jz .Lxts_dec_ret_iv

.Lxts_dec_loop1:
	movdqu (INP), STATE

	add $16, INP
	sub $16, LEN
	jl .Lxts_dec_cts1		/* this is the held-back block */

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE
	_aesni_gf128mul_x_ble()

	test LEN, LEN
	jz .Lxts_dec_out		/* that was the last block */

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_dec_loop1

.Lxts_dec_out:
	movdqu STATE, (OUTP)
	jmp .Lxts_dec_ret_iv

.Lxts_dec_cts1:
	/*
	 * Keep the current tweak in STATE4 for the final cipher call and
	 * decrypt the held-back block with the following tweak.
	 */
	movdqa IV, STATE4
	_aesni_gf128mul_x_ble()

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE

#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $16, LEN		/* # bytes in final block */
	movups (INP), IN1

	/* T1 -> table[LEN], IVP -> table[32-LEN]; IVP no longer points at iv */
	mov T1, IVP
	add $32, IVP
	add LEN, T1
	sub LEN, IVP
	add OUTP, LEN			/* LEN = address of the partial output block */

	movups (T1), %xmm4
	movaps STATE, IN2
	pshufb %xmm4, STATE
	movups STATE, (LEN)		/* overlaps the block at OUTP, which is stored last */

	movups (IVP), %xmm0		/* shuffle control, then implicit blend mask */
	pshufb %xmm0, IN1
	pblendvb IN2, IN1		/* bytes whose mask high bit is set come from IN2 */
	movaps IN1, STATE

	pxor STATE4, STATE
	call _aesni_dec1
	pxor STATE4, STATE

	movups STATE, (OUTP)
	jmp .Lxts_dec_ret
SYM_FUNC_END(aesni_xts_decrypt)