/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement the AES algorithm with the Intel AES-NI instructions.
 *
 * The white paper on the AES-NI instructions can be downloaded from:
 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 * Copyright (c) 2010, Intel Corporation.
 *
 * Ported the x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values; for FP
 * use movaps (move aligned packed single) and for integer use movdqa (move
 * double quadword aligned).  It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released.  However,
 * movaps is a byte shorter, so that is the one we'll use for now (same for
 * the unaligned variant).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

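# Purely illustrative uses of the two macros (the operands here are just
# placeholders; MOVADQ is used exactly this way later in this file):
#	MOVADQ	SHUF_MASK(%rip), %xmm14		# source known 16-byte aligned
#	MOVUDQ	(%r10), %xmm7			# source may be unaligned
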
#ifdef __x86_64__

# Constants are kept in mergeable sections so the linker can reorder and
# merge identical values.
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001

.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F

.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff

.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000

.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001

.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0

.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1

.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# The order of these constants must not change; ALL_F must immediately
# follow SHIFT_MASK, and the all-zero block must immediately follow ALL_F.
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text


#define	STACK_OFFSET    8*3

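# A worked example of why STACK_OFFSET is 8*3: FUNC_SAVE (below) pushes the
# three 8-byte registers %r12-%r14, so after FUNC_SAVE the first stack-passed
# argument of an 11-argument call sits at
#	arg7 = STACK_OFFSET + 8(%rsp) = 8*3 + 8(%rsp) = 32(%rsp)
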
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)

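// The offsets above index into the C-side gcm_context_data passed in %arg2.
// A rough sketch of the layout they imply (illustrative only; the
// authoritative definition lives in the C glue code, and the field names
// here just follow the comments in GCM_INIT below):
//
//	struct gcm_context_data {
//		u8  aad_hash[16];              /* AadHash,      16*0   */
//		u64 aad_length;                /* AadLen,       16*1   */
//		u64 in_length;                 /* InLen,        16*1+8 */
//		u8  partial_block_enc_key[16]; /* PBlockEncKey, 16*2   */
//		u8  orig_IV[16];               /* OrigIV,       16*3   */
//		u8  current_counter[16];       /* CurCount,     16*4   */
//		u64 partial_block_length;      /* PBlockLen,    16*5   */
//		/* 8 bytes of padding so the hash keys start at 16*6 */
//		u8  hash_keys[16 * 8];         /* HashKey .. HashKey_4_k */
//	};
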
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm7

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm

# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data.  Only needs to be called
# once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	pshufb	\TMP2, \TMP3

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3
	psrlq	$63, \TMP2
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2
	psrldq	$8, \TMP1
	por	\TMP2, \TMP3

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd	TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP3, \TMP1
	pxor	\TMP3, \TMP1
	movdqu	\TMP1, HashKey_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	\TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	\TMP5, HashKey_3(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	\TMP5, HashKey_4(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_4_k(%arg2)
.endm

# GCM_INIT initializes a gcm_context_data struct to prepare for
# encoding/decoding.
# Clobbers rax, r10-r13, and xmm0-xmm6, %xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov	\AADLEN, %r11
	mov	%r11, AadLen(%arg2)		# ctx_data.aad_length = aad_length
	xor	%r11d, %r11d
	mov	%r11, InLen(%arg2)		# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)		# ctx_data.partial_block_length = 0
	mov	%r11, PBlockEncKey(%arg2)	# ctx_data.partial_block_enc_key = 0
	mov	\Iv, %rax
	movdqu	(%rax), %xmm0
	movdqu	%xmm0, OrigIV(%arg2)		# ctx_data.orig_IV = iv

	movdqa	SHUF_MASK(%rip), %xmm2
	pshufb	%xmm2, %xmm0
	movdqu	%xmm0, CurCount(%arg2)		# ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu	HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

# GCM_ENC_DEC Encodes/Decodes the given data.  Assumes that the passed
# gcm_context_data struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
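#
# Control flow, sketched in pseudo-C (names here are illustrative only):
#
#	handle a partial block left over from the previous update call;
#	n = whole_blocks % 4;              /* 0..3 blocks handled up front  */
#	INITIAL_BLOCKS_ENC_DEC(n);         /* so the main loop runs 4-wide  */
#	while (bytes_remaining >= 64)
#		GHASH_4_ENCRYPT_4_PARALLEL(...);  /* 4 blocks per iteration */
#	GHASH_LAST_4(...);                 /* fold the last 4 into the hash */
#	handle the trailing <16-byte block, if any;
#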
.macro GCM_ENC_DEC operation
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13
	add	%arg5, InLen(%arg2)

	xor	%r11d, %r11d	# initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub	%r11, %arg5	# sub partial block data used
	mov	%arg5, %r13	# save the number of bytes

	and	$-16, %r13	# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	# Encrypt/Decrypt first few blocks

	and	$(3<<4), %r12
	jz	.L_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	.L_initial_num_blocks_is_1_\@
	je	.L_initial_num_blocks_is_2_\@
.L_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	.L_initial_blocks_\@
.L_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	.L_initial_blocks_\@
.L_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	.L_initial_blocks_\@
.L_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
.L_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	test	%r13, %r13
	je	.L_zero_cipher_left_\@
	sub	$64, %r13
	je	.L_four_cipher_left_\@
.L_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	.L_crypt_by_4_\@
.L_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
.L_zero_cipher_left_\@:
	movdqu	%xmm8, AadHash(%arg2)
	movdqu	%xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13		# %r13 = arg5 (mod 16)
	je	.L_multiple_of_16_bytes_\@

	mov	%r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0	# INCR CNT to get Yn
	movdqu	%xmm0, CurCount(%arg2)
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	movdqu	%xmm0, PBlockEncKey(%arg2)

	cmp	$16, %arg5
	jge	.L_large_enough_update_\@

	lea	(%arg4,%r11,1), %r10
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp	.L_data_read_\@

.L_large_enough_update_\@:
	sub	$16, %r11
	add	%r13, %r11

	# read the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	pshufb	%xmm2, %xmm1

.L_data_read_\@:
	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12

.ifc \operation, dec
	movdqa	%xmm1, %xmm2
.endif
	pxor	%xmm1, %xmm0		# XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0		# mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm2

	pxor	%xmm2, %xmm8
.else
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm0

	pxor	%xmm0, %xmm8
.endif

	movdqu	%xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	pshufb	%xmm10, %xmm0
.endif

	# Output %r13 bytes
	movq	%xmm0, %rax
	cmp	$8, %r13
	jle	.L_less_than_8_bytes_left_\@
	mov	%rax, (%arg3 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	movq	%xmm0, %rax
	sub	$8, %r13
.L_less_than_8_bytes_left_\@:
	mov	%al, (%arg3, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	.L_less_than_8_bytes_left_\@
.L_multiple_of_16_bytes_\@:
.endm

# GCM_COMPLETE Finishes the tag update for the last, partial block
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13

	mov	PBlockLen(%arg2), %r12

	test	%r12, %r12
	je	.L_partial_done\@

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

.L_partial_done\@:
	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	mov	InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*8)
	movq	%r12, %xmm1

	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm8

	movdqu	OrigIV(%arg2), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
.L_return_T_\@:
	mov	\AUTHTAG, %r10		# %r10 = authTag
	mov	\AUTHTAGLEN, %r11	# %r11 = auth_tag_len
	cmp	$16, %r11
	je	.L_T_16_\@
	cmp	$8, %r11
	jl	.L_T_4_\@
.L_T_8_\@:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	test	%r11, %r11
	je	.L_return_T_done_\@
.L_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	test	%r11, %r11
	je	.L_return_T_done_\@
.L_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	.L_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	.L_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
.L_T_1_\@:
	mov	%al, (%r10)
	jmp	.L_return_T_done_\@
.L_T_16_\@:
	movdqu	%xmm0, (%r10)
.L_return_T_done_\@:
.endm

#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bit each, bit-reflected)
* Output: C = A*B*x mod poly (i.e. >>1)
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input.
* GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly.
*
*/
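/*
* Karatsuba refresher, a sketch of the math the macro below relies on:
* split each 128-bit operand into 64-bit halves, A = a1:a0 and B = b1:b0.
* With carry-less (XOR) arithmetic,
*
*	A*B = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
*
* and the middle term can be recovered with one extra multiply, because
*
*	(a1 + a0)*(b1 + b0) = a1*b1 + a1*b0 + a0*b1 + a0*b0
*
* so only three pclmulqdq operations are needed instead of four.  The
* HashKey*_k values precomputed above are exactly these (a1 + a0)-style
* XORs of the hash key powers.  The 256-bit product is then reduced mod
* poly in the two shift/XOR phases below.
*/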
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	pclmulqdq $0x11, \HK, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \HK, \GH	# GH = a0*b0
	pclmulqdq $0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = (a1*b0)+(a0*b1), the middle term
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH,\TMP3
	movdqa	  \GH,\TMP4
	psrld	  $1,\TMP2		# packed right shift >>1
	psrld	  $2,\TMP3		# packed right shift >>2
	psrld	  $7,\TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	.L_read_lt8_\@
	mov	(\DPTR), %rax
	movq	%rax, \XMMDst
	sub	$8, \DLEN
	jz	.L_done_read_partial_block_\@
	xor	%eax, %eax
.L_read_next_byte_\@:
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	.L_read_next_byte_\@
	movq	%rax, \XMM1
	pslldq	$8, \XMM1
	por	\XMM1, \XMMDst
	jmp	.L_done_read_partial_block_\@
.L_read_lt8_\@:
	xor	%eax, %eax
.L_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	.L_read_next_byte_lt8_\@
	movq	%rax, \XMMDst
.L_done_read_partial_block_\@:
.endm
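
# Functionally, the macro above is a bounds-safe version of this C sketch
# (illustrative only):
#
#	u8 dst[16] = { 0 };		/* XMMDst, upper bytes zeroed */
#	memcpy(dst, dptr, dlen);	/* dlen is 1..15              */
#
# The byte-at-a-time tail loops exist so we never read past dptr[dlen-1],
# which could otherwise cross into an unmapped page.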

# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
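# In math terms, with the AAD split into 16-byte blocks A1..An and hash key
# H, this computes the standard GHASH recurrence
#	X0 = 0,  Xi = (X(i-1) ^ Ai) * H  in GF(2^128)
# and stores Xn as the running AadHash; a short final block is zero-padded
# by READ_PARTIAL_BLOCK before the last multiply.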
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   \AAD, %r10		# %r10 = AAD
	mov	   \AADLEN, %r11	# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6

	cmp	   $16, %r11
	jl	   .L_get_AAD_rest\@
.L_get_AAD_blocks\@:
	movdqu	   (%r10), \TMP7
	pshufb	   %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   .L_get_AAD_blocks\@

	movdqu	   \TMP6, \TMP7

	/* read the last <16B of AAD */
.L_get_AAD_rest\@:
	test	   %r11, %r11
	je	   .L_get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	pshufb	   %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	   \TMP7, \TMP6

.L_get_AAD_done\@:
	movdqu	   \TMP6, AadHash(%arg2)
.endm

# PARTIAL_BLOCK: Handles the encryption/decryption and hashing of the partial
# blocks carried between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
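# Bookkeeping sketch in pseudo-C (illustrative only; PBlockEncKey holds the
# E(K, Yn) keystream block saved by the previous update call):
#
#	if (ctx->partial_block_length) {
#		n = min(16 - ctx->partial_block_length, plain_cyph_len);
#		/* XOR n input bytes against the saved keystream bytes, */
#		/* and fold the block into the hash once it fills up    */
#		ctx->partial_block_length =
#			(ctx->partial_block_length + n) % 16;
#	}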
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov	PBlockLen(%arg2), %r13
	test	%r13, %r13
	je	.L_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	.L_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
	jmp	.L_data_read_\@

.L_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov	PBlockLen(%arg2), %r13

.L_data_read_\@:			# Finished reading in data

	movdqu	PBlockEncKey(%arg2), %xmm9
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	add	%r13, %r12
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	pshufb	%xmm2, %xmm9		# shift right r13 bytes

.ifc \operation, dec
	movdqa	%xmm1, %xmm3
	pxor	%xmm1, %xmm9		# Ciphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	.L_no_extra_mask_1_\@
	sub	%r10, %r12
.L_no_extra_mask_1_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	pand	%xmm1, %xmm3
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm3
	pshufb	%xmm2, %xmm3
	pxor	%xmm3, \AAD_HASH

	test	%r10, %r10
	jl	.L_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)
	jmp	.L_dec_done_\@
.L_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
.L_dec_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)
.else
	pxor	%xmm1, %xmm9		# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	.L_no_extra_mask_2_\@
	sub	%r10, %r12
.L_no_extra_mask_2_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9

	movdqa	SHUF_MASK(%rip), %xmm1
	pshufb	%xmm1, %xmm9
	pshufb	%xmm2, %xmm9
	pxor	%xmm9, \AAD_HASH

	test	%r10, %r10
	jl	.L_partial_incomplete_2_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)
	jmp	.L_encode_done_\@
.L_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
.L_encode_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)

	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm9 back to output as ciphertext
	pshufb	%xmm10, %xmm9
	pshufb	%xmm2, %xmm9
.endif
	# output encrypted Bytes
	test	%r10, %r10
	jl	.L_partial_fill_\@
	mov	%r13, %r12
	mov	$16, %r13
	# Set r13 to be the number of bytes to write out
	sub	%r12, %r13
	jmp	.L_count_set_\@
.L_partial_fill_\@:
	mov	\PLAIN_CYPH_LEN, %r13
.L_count_set_\@:
	movdqa	%xmm9, %xmm0
	movq	%xmm0, %rax
	cmp	$8, %r13
	jle	.L_less_than_8_bytes_left_\@

	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$8, \DATA_OFFSET
	psrldq	$8, %xmm0
	movq	%xmm0, %rax
	sub	$8, %r13
.L_less_than_8_bytes_left_\@:
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$1, \DATA_OFFSET
	shr	$8, %rax
	sub	$1, %r13
	jne	.L_less_than_8_bytes_left_\@
.L_partial_block_done_\@:
.endm # PARTIAL_BLOCK

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3 are used as pointers only, not modified
*/


.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	SHUF_MASK(%rip), %xmm14

	movdqu	AadHash(%arg2), %xmm\i		# load the current hash value

	# start AES for num_initial_blocks blocks

	movdqu	CurCount(%arg2), \XMM0		# XMM0 = Y0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ	ONE(%rip),\TMP1
	MOVADQ	0(%arg1),\TMP2
.irpc index, \i_seq
	paddd	\TMP1, \XMM0			# INCR Y0
.ifc \operation, dec
	movdqa	\XMM0, %xmm\index
.else
	MOVADQ	\XMM0, %xmm\index
.endif
	pshufb	%xmm14, %xmm\index		# perform a 16 byte swap
	pxor	\TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax				# 128->9, 192->11, 256->13

.Laes_loop_initial_\@:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	aesenc	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	.Laes_loop_initial_\@

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	aesenclast \TMP1, %xmm\index		# Last Round
.endr
.irpc index, \i_seq
	movdqu	(%arg4 , %r11, 1), \TMP1
	pxor	\TMP1, %xmm\index
	movdqu	%xmm\index, (%arg3 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	$16, %r11

.ifc \operation, dec
	movdqa	\TMP1, %xmm\index
.endif
	pshufb	%xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	%xmm5, %xmm6
	GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	%xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	$64, %r13
	jl	.L_initial_blocks_done\@
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
	MOVADQ	ONE(%rip),\TMP1
	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM1
	pshufb	%xmm14, \XMM1			# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM2
	pshufb	%xmm14, \XMM2			# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM3
	pshufb	%xmm14, \XMM3			# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	MOVADQ	\XMM0, \XMM4
	pshufb	%xmm14, \XMM4			# perform a 16 byte swap

	MOVADQ	0(%arg1),\TMP1
	pxor	\TMP1, \XMM1
	pxor	\TMP1, \XMM2
	pxor	\TMP1, \XMM3
	pxor	\TMP1, \XMM4
.irpc index, 1234 # do 4 rounds
	movaps	0x10*\index(%arg1), \TMP1
	aesenc	\TMP1, \XMM1
	aesenc	\TMP1, \XMM2
	aesenc	\TMP1, \XMM3
	aesenc	\TMP1, \XMM4
.endr
.irpc index, 56789 # do next 5 rounds
	movaps	0x10*\index(%arg1), \TMP1
	aesenc	\TMP1, \XMM1
	aesenc	\TMP1, \XMM2
	aesenc	\TMP1, \XMM3
	aesenc	\TMP1, \XMM4
.endr
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	sub	$4,%eax				# 128->0, 192->2, 256->4
	jz	.Laes_loop_pre_done\@

.Laes_loop_pre_\@:
	MOVADQ	(%r10),\TMP2
.irpc	index, 1234
	aesenc	\TMP2, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	.Laes_loop_pre_\@

.Laes_loop_pre_done\@:
	MOVADQ	(%r10), \TMP2
	aesenclast \TMP2, \XMM1
	aesenclast \TMP2, \XMM2
	aesenclast \TMP2, \XMM3
	aesenclast \TMP2, \XMM4
	movdqu	16*0(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM1
.ifc \operation, dec
	movdqu	\XMM1, 16*0(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM1
.endif
	movdqu	16*1(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM2
.ifc \operation, dec
	movdqu	\XMM2, 16*1(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM2
.endif
	movdqu	16*2(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM3
.ifc \operation, dec
	movdqu	\XMM3, 16*2(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM3
.endif
	movdqu	16*3(%arg4 , %r11 , 1), \TMP1
	pxor	\TMP1, \XMM4
.ifc \operation, dec
	movdqu	\XMM4, 16*3(%arg3 , %r11 , 1)
	movdqa	\TMP1, \XMM4
.else
	movdqu	\XMM1, 16*0(%arg3 , %r11 , 1)
	movdqu	\XMM2, 16*1(%arg3 , %r11 , 1)
	movdqu	\XMM3, 16*2(%arg3 , %r11 , 1)
	movdqu	\XMM4, 16*3(%arg3 , %r11 , 1)
.endif

	add	$64, %r11
	pshufb	%xmm14, \XMM1		# perform a 16 byte swap
	pxor	\XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	pshufb	%xmm14, \XMM2		# perform a 16 byte swap
	pshufb	%xmm14, \XMM3		# perform a 16 byte swap
	pshufb	%xmm14, \XMM4		# perform a 16 byte swap

.L_initial_blocks_done\@:

.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using Karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqu	  HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	pshufb	  %xmm15, \XMM1			# perform a 16 byte swap
	pclmulqdq $0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	pshufb	  %xmm15, \XMM2			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM3			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM4			# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqu	  HashKey_4_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	aesenc	  \TMP1, \XMM1			# Round 1
	aesenc	  \TMP1, \XMM2
	aesenc	  \TMP1, \XMM3
	aesenc	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	aesenc	  \TMP1, \XMM1			# Round 2
	aesenc	  \TMP1, \XMM2
	aesenc	  \TMP1, \XMM3
	aesenc	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 3
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 4
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	movdqu	  HashKey_3_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 5
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5

	# Multiply TMP5 * HashKey using Karatsuba

	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 6
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 7
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	movdqu	  HashKey_2_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 8
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 9
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  .Laes_loop_par_enc_done\@

.Laes_loop_par_enc\@:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	aesenc	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  .Laes_loop_par_enc\@

.Laes_loop_par_enc_done\@:
	MOVADQ	  (%r10), \TMP3
	aesenclast \TMP3, \XMM1			# Round 10
	aesenclast \TMP3, \XMM2
	aesenclast \TMP3, \XMM3
	aesenclast \TMP3, \XMM4
	movdqu	  HashKey_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg3,%r11,1)	# Write to the ciphertext buffer
	pshufb	  %xmm15, \XMM1			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM2			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM3			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM4			# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2		# packed left shift << 31
	pslld	  $30, \TMP3		# packed left shift << 30
	pslld	  $25, \TMP4		# packed left shift << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift T5 1 DW
	pslldq	  $12, \TMP2		# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2		# packed right shift >>1
	psrld	  $2, \TMP3		# packed right shift >>2
	psrld	  $7, \TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5		# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using Karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqu	  HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	pshufb	  %xmm15, \XMM1			# perform a 16 byte swap
	pclmulqdq $0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	pshufb	  %xmm15, \XMM2			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM3			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM4			# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqu	  HashKey_4_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	aesenc	  \TMP1, \XMM1			# Round 1
	aesenc	  \TMP1, \XMM2
	aesenc	  \TMP1, \XMM3
	aesenc	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	aesenc	  \TMP1, \XMM1			# Round 2
	aesenc	  \TMP1, \XMM2
	aesenc	  \TMP1, \XMM3
	aesenc	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 3
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 4
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	movdqu	  HashKey_3_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 5
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5

	# Multiply TMP5 * HashKey using Karatsuba

	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 6
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 7
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	movdqu	  HashKey_2_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 8
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	aesenc	  \TMP3, \XMM1			# Round 9
	aesenc	  \TMP3, \XMM2
	aesenc	  \TMP3, \XMM3
	aesenc	  \TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  .Laes_loop_par_dec_done\@

.Laes_loop_par_dec\@:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	aesenc	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  .Laes_loop_par_dec\@

.Laes_loop_par_dec_done\@:
	MOVADQ	  (%r10), \TMP3
	aesenclast \TMP3, \XMM1			# last round
	aesenclast \TMP3, \XMM2
	aesenclast \TMP3, \XMM3
	aesenclast \TMP3, \XMM4
	movdqu	  HashKey_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)		# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4
	pshufb	  %xmm15, \XMM1			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM2			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM3			# perform a 16 byte swap
	pshufb	  %xmm15, \XMM4			# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2		# packed left shift << 31
	pslld	  $30, \TMP3		# packed left shift << 30
	pslld	  $25, \TMP4		# packed left shift << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift T5 1 DW
	pslldq	  $12, \TMP2		# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2		# packed right shift >>1
	psrld	  $2, \TMP3		# packed right shift >>2
	psrld	  $7, \TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5		# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply TMP6 * HashKey (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqu	  HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqu	  HashKey_4_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	movdqu	  HashKey_3_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	movdqu	  HashKey_2_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	movdqu	  HashKey_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in Karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4		# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2		# packed left shifting << 31
	pslld	  $30, \TMP3		# packed left shifting << 30
	pslld	  $25, \TMP4		# packed left shifting << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7		# right shift TMP7 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2		# packed right shift >> 1
	psrld	  $2, \TMP3		# packed right shift >> 2
	psrld	  $7, \TMP4		# packed right shift >> 7
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst	# reduced result is in XMMDst
.endm


/* Encryption of a single block
* uses eax & r10
*/

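# Round-count arithmetic used below (and in the loops above), worked out:
# the "keysize" field holds the key length in bytes, and the number of
# one-round loop iterations is keylen/4 + 5 = nr_rounds - 1, i.e.
#	AES-128: 16/4 + 5 =  9 = 10 - 1
#	AES-192: 24/4 + 5 = 11 = 12 - 1
#	AES-256: 32/4 + 5 = 13 = 14 - 1
# The final round is done separately with aesenclast.
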
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor	(%arg1), \XMM0
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13
	lea	16(%arg1), %r10		# get first expanded key address

_esb_loop_\@:
	MOVADQ	(%r10),\TMP1
	aesenc	\TMP1,\XMM0
	add	$16,%r10
	sub	$1,%eax
	jnz	_esb_loop_\@

	MOVADQ	(%r10),\TMP1
	aesenclast \TMP1,\XMM0
.endm
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                   struct gcm_context_data *data
*                                      // Context data
*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
*                   const u8 *in,      // Ciphertext input
*                   u64 plaintext_len, // Length of data in bytes for decryption.
*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                   const u8 *aad,     // Additional Authentication Data (AAD)
*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                   u8 *auth_tag,      // Authenticated Tag output. The driver will compare this to the
*                                      // given authentication tag and only return the plaintext if they match.
*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                      // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. We are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                          Salt  (From the SA)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |        (This is the sequence number from IPSec header)        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A1)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                  32-bit Sequence Number (A0)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A2)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |              64-bit Extended Sequence Number {A1,A0}          |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                  AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
SYM_FUNC_START(aesni_gcm_dec)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC dec
	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	RET
SYM_FUNC_END(aesni_gcm_dec)


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data
*                                      // Context data
*                    u8 *out,          // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,     // Plaintext input
*                    u64 plaintext_len, // Length of data in bytes for encryption.
*                    u8 *iv,           // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,  // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,    // Additional Authentication Data (AAD)
*                    u64 aad_len,      // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,     // Authenticated Tag output.
*                    u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                      // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. We are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                          Salt  (From the SA)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |        (This is the sequence number from IPSec header)        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A1)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                  32-bit Sequence Number (A0)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A2)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |              64-bit Extended Sequence Number {A1,A0}          |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                  AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
6dcc5627 1678SYM_FUNC_START(aesni_gcm_enc)
6c2c86b3 1679 FUNC_SAVE
0bd82f5f 1680
fb8986e6 1681 GCM_INIT %arg6, arg7, arg8, arg9
ba45833e 1682 GCM_ENC_DEC enc
fb8986e6
DW
1683
1684 GCM_COMPLETE arg10, arg11
6c2c86b3 1685 FUNC_RESTORE
f94909ce 1686 RET
6dcc5627 1687SYM_FUNC_END(aesni_gcm_enc)
3c097b80 1688
fb8986e6
DW
1689/*****************************************************************************
1690* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1691* struct gcm_context_data *data,
1692* // context data
1693* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1694* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1695* // concatenated with 0x00000001. 16-byte aligned pointer.
1696* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1697* const u8 *aad, // Additional Authentication Data (AAD)
1698* u64 aad_len) // Length of AAD in bytes.
1699*/
6dcc5627 1700SYM_FUNC_START(aesni_gcm_init)
fb8986e6
DW
1701 FUNC_SAVE
1702 GCM_INIT %arg3, %arg4,%arg5, %arg6
1703 FUNC_RESTORE
f94909ce 1704 RET
6dcc5627 1705SYM_FUNC_END(aesni_gcm_init)
fb8986e6
DW
1706
1707/*****************************************************************************
1708* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1709* struct gcm_context_data *data,
1710* // context data
1711* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1712* const u8 *in, // Plaintext input
1713* u64 plaintext_len, // Length of data in bytes for encryption.
1714*/
6dcc5627 1715SYM_FUNC_START(aesni_gcm_enc_update)
fb8986e6
DW
1716 FUNC_SAVE
1717 GCM_ENC_DEC enc
1718 FUNC_RESTORE
f94909ce 1719 RET
6dcc5627 1720SYM_FUNC_END(aesni_gcm_enc_update)
fb8986e6
DW
1721
1722/*****************************************************************************
1723* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1724* struct gcm_context_data *data,
1725* // context data
1726* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1727* const u8 *in, // Plaintext input
1728* u64 plaintext_len, // Length of data in bytes for encryption.
1729*/
6dcc5627 1730SYM_FUNC_START(aesni_gcm_dec_update)
fb8986e6
DW
1731 FUNC_SAVE
1732 GCM_ENC_DEC dec
1733 FUNC_RESTORE
f94909ce 1734 RET
6dcc5627 1735SYM_FUNC_END(aesni_gcm_dec_update)
fb8986e6
DW
1736
1737/*****************************************************************************
1738* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1739* struct gcm_context_data *data,
1740* // context data
1741* u8 *auth_tag, // Authenticated Tag output.
1742* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1743* // 12 or 8.
1744*/
6dcc5627 1745SYM_FUNC_START(aesni_gcm_finalize)
fb8986e6
DW
1746 FUNC_SAVE
1747 GCM_COMPLETE %arg3 %arg4
1748 FUNC_RESTORE
f94909ce 1749 RET
6dcc5627 1750SYM_FUNC_END(aesni_gcm_finalize)
fb8986e6 1751
559ad0ff 1752#endif
0bd82f5f 1753
74d8b90a 1754SYM_FUNC_START_LOCAL(_key_expansion_256a)
54b6a1bd
HY
1755 pshufd $0b11111111, %xmm1, %xmm1
1756 shufps $0b00010000, %xmm0, %xmm4
1757 pxor %xmm4, %xmm0
1758 shufps $0b10001100, %xmm0, %xmm4
1759 pxor %xmm4, %xmm0
1760 pxor %xmm1, %xmm0
0d258efb
MK
1761 movaps %xmm0, (TKEYP)
1762 add $0x10, TKEYP
f94909ce 1763 RET
74d8b90a 1764SYM_FUNC_END(_key_expansion_256a)
7be2e319 1765SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
54b6a1bd 1766
74d8b90a 1767SYM_FUNC_START_LOCAL(_key_expansion_192a)
54b6a1bd
HY
1768 pshufd $0b01010101, %xmm1, %xmm1
1769 shufps $0b00010000, %xmm0, %xmm4
1770 pxor %xmm4, %xmm0
1771 shufps $0b10001100, %xmm0, %xmm4
1772 pxor %xmm4, %xmm0
1773 pxor %xmm1, %xmm0
1774
1775 movaps %xmm2, %xmm5
1776 movaps %xmm2, %xmm6
1777 pslldq $4, %xmm5
1778 pshufd $0b11111111, %xmm0, %xmm3
1779 pxor %xmm3, %xmm2
1780 pxor %xmm5, %xmm2
1781
1782 movaps %xmm0, %xmm1
1783 shufps $0b01000100, %xmm0, %xmm6
0d258efb 1784 movaps %xmm6, (TKEYP)
54b6a1bd 1785 shufps $0b01001110, %xmm2, %xmm1
0d258efb
MK
1786 movaps %xmm1, 0x10(TKEYP)
1787 add $0x20, TKEYP
f94909ce 1788 RET
74d8b90a 1789SYM_FUNC_END(_key_expansion_192a)
54b6a1bd 1790
74d8b90a 1791SYM_FUNC_START_LOCAL(_key_expansion_192b)
54b6a1bd
HY
1792 pshufd $0b01010101, %xmm1, %xmm1
1793 shufps $0b00010000, %xmm0, %xmm4
1794 pxor %xmm4, %xmm0
1795 shufps $0b10001100, %xmm0, %xmm4
1796 pxor %xmm4, %xmm0
1797 pxor %xmm1, %xmm0
1798
1799 movaps %xmm2, %xmm5
1800 pslldq $4, %xmm5
1801 pshufd $0b11111111, %xmm0, %xmm3
1802 pxor %xmm3, %xmm2
1803 pxor %xmm5, %xmm2
1804
0d258efb
MK
1805 movaps %xmm0, (TKEYP)
1806 add $0x10, TKEYP
f94909ce 1807 RET
74d8b90a 1808SYM_FUNC_END(_key_expansion_192b)
54b6a1bd 1809
74d8b90a 1810SYM_FUNC_START_LOCAL(_key_expansion_256b)
54b6a1bd
HY
1811 pshufd $0b10101010, %xmm1, %xmm1
1812 shufps $0b00010000, %xmm2, %xmm4
1813 pxor %xmm4, %xmm2
1814 shufps $0b10001100, %xmm2, %xmm4
1815 pxor %xmm4, %xmm2
1816 pxor %xmm1, %xmm2
0d258efb
MK
1817 movaps %xmm2, (TKEYP)
1818 add $0x10, TKEYP
f94909ce 1819 RET
74d8b90a 1820SYM_FUNC_END(_key_expansion_256b)
54b6a1bd
HY
1821
1822/*
1823 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1824 * unsigned int key_len)
1825 */
6dcc5627 1826SYM_FUNC_START(aesni_set_key)
8691ccd7 1827 FRAME_BEGIN
0d258efb
MK
1828#ifndef __x86_64__
1829 pushl KEYP
8691ccd7
JP
1830 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1831 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1832 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
0d258efb
MK
1833#endif
1834 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1835 movaps %xmm0, (KEYP)
1836 lea 0x10(KEYP), TKEYP # key addr
1837 movl %edx, 480(KEYP)
54b6a1bd
HY
1838 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1839 cmp $24, %dl
1840 jb .Lenc_key128
1841 je .Lenc_key192
0d258efb
MK
1842 movups 0x10(UKEYP), %xmm2 # other user key
1843 movaps %xmm2, (TKEYP)
1844 add $0x10, TKEYP
d7866e50 1845 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
54b6a1bd 1846 call _key_expansion_256a
d7866e50 1847 aeskeygenassist $0x1, %xmm0, %xmm1
54b6a1bd 1848 call _key_expansion_256b
d7866e50 1849 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
54b6a1bd 1850 call _key_expansion_256a
d7866e50 1851 aeskeygenassist $0x2, %xmm0, %xmm1
54b6a1bd 1852 call _key_expansion_256b
d7866e50 1853 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
54b6a1bd 1854 call _key_expansion_256a
d7866e50 1855 aeskeygenassist $0x4, %xmm0, %xmm1
54b6a1bd 1856 call _key_expansion_256b
d7866e50 1857 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
54b6a1bd 1858 call _key_expansion_256a
d7866e50 1859 aeskeygenassist $0x8, %xmm0, %xmm1
54b6a1bd 1860 call _key_expansion_256b
d7866e50 1861 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
54b6a1bd 1862 call _key_expansion_256a
d7866e50 1863 aeskeygenassist $0x10, %xmm0, %xmm1
54b6a1bd 1864 call _key_expansion_256b
d7866e50 1865 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
54b6a1bd 1866 call _key_expansion_256a
d7866e50 1867 aeskeygenassist $0x20, %xmm0, %xmm1
54b6a1bd 1868 call _key_expansion_256b
d7866e50 1869 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
54b6a1bd
HY
1870 call _key_expansion_256a
1871 jmp .Ldec_key
1872.Lenc_key192:
0d258efb 1873 movq 0x10(UKEYP), %xmm2 # other user key
d7866e50 1874 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
54b6a1bd 1875 call _key_expansion_192a
d7866e50 1876 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
54b6a1bd 1877 call _key_expansion_192b
d7866e50 1878 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
54b6a1bd 1879 call _key_expansion_192a
d7866e50 1880 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
54b6a1bd 1881 call _key_expansion_192b
d7866e50 1882 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
54b6a1bd 1883 call _key_expansion_192a
d7866e50 1884 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
54b6a1bd 1885 call _key_expansion_192b
d7866e50 1886 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
54b6a1bd 1887 call _key_expansion_192a
d7866e50 1888 aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
54b6a1bd
HY
1889 call _key_expansion_192b
1890 jmp .Ldec_key
1891.Lenc_key128:
d7866e50 1892 aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
54b6a1bd 1893 call _key_expansion_128
d7866e50 1894 aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
54b6a1bd 1895 call _key_expansion_128
d7866e50 1896 aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
54b6a1bd 1897 call _key_expansion_128
d7866e50 1898 aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
54b6a1bd 1899 call _key_expansion_128
d7866e50 1900 aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
54b6a1bd 1901 call _key_expansion_128
d7866e50 1902 aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
54b6a1bd 1903 call _key_expansion_128
d7866e50 1904 aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
54b6a1bd 1905 call _key_expansion_128
d7866e50 1906 aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
54b6a1bd 1907 call _key_expansion_128
d7866e50 1908 aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
54b6a1bd 1909 call _key_expansion_128
d7866e50 1910 aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
54b6a1bd
HY
1911 call _key_expansion_128
1912.Ldec_key:
0d258efb
MK
1913 sub $0x10, TKEYP
1914 movaps (KEYP), %xmm0
1915 movaps (TKEYP), %xmm1
1916 movaps %xmm0, 240(TKEYP)
1917 movaps %xmm1, 240(KEYP)
1918 add $0x10, KEYP
1919 lea 240-16(TKEYP), UKEYP
54b6a1bd
HY
1920.align 4
1921.Ldec_key_loop:
0d258efb 1922 movaps (KEYP), %xmm0
d7866e50 1923 aesimc %xmm0, %xmm1
0d258efb
MK
1924 movaps %xmm1, (UKEYP)
1925 add $0x10, KEYP
1926 sub $0x10, UKEYP
1927 cmp TKEYP, KEYP
54b6a1bd 1928 jb .Ldec_key_loop
0d258efb
MK
1929 xor AREG, AREG
1930#ifndef __x86_64__
1931 popl KEYP
1932#endif
8691ccd7 1933 FRAME_END
f94909ce 1934 RET
6dcc5627 1935SYM_FUNC_END(aesni_set_key)
54b6a1bd
HY
1936
1937/*
9c1e8836 1938 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
54b6a1bd 1939 */
6dcc5627 1940SYM_FUNC_START(aesni_enc)
8691ccd7 1941 FRAME_BEGIN
0d258efb
MK
1942#ifndef __x86_64__
1943 pushl KEYP
1944 pushl KLEN
8691ccd7
JP
1945 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1946 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1947 movl (FRAME_OFFSET+20)(%esp), INP # src
0d258efb 1948#endif
54b6a1bd
HY
1949 movl 480(KEYP), KLEN # key length
1950 movups (INP), STATE # input
1951 call _aesni_enc1
1952 movups STATE, (OUTP) # output
0d258efb
MK
1953#ifndef __x86_64__
1954 popl KLEN
1955 popl KEYP
1956#endif
8691ccd7 1957 FRAME_END
f94909ce 1958 RET
6dcc5627 1959SYM_FUNC_END(aesni_enc)
54b6a1bd
HY
1960
1961/*
1962 * _aesni_enc1: internal ABI
1963 * input:
1964 * KEYP: key struct pointer
1965 * KLEN: round count
1966 * STATE: initial state (input)
1967 * output:
1968 * STATE: finial state (output)
1969 * changed:
1970 * KEY
1971 * TKEYP (T1)
1972 */
74d8b90a 1973SYM_FUNC_START_LOCAL(_aesni_enc1)
54b6a1bd
HY
1974 movaps (KEYP), KEY # key
1975 mov KEYP, TKEYP
1976 pxor KEY, STATE # round 0
1977 add $0x30, TKEYP
1978 cmp $24, KLEN
1979 jb .Lenc128
1980 lea 0x20(TKEYP), TKEYP
1981 je .Lenc192
1982 add $0x20, TKEYP
1983 movaps -0x60(TKEYP), KEY
d7866e50 1984 aesenc KEY, STATE
54b6a1bd 1985 movaps -0x50(TKEYP), KEY
d7866e50 1986 aesenc KEY, STATE
54b6a1bd
HY
1987.align 4
1988.Lenc192:
1989 movaps -0x40(TKEYP), KEY
d7866e50 1990 aesenc KEY, STATE
54b6a1bd 1991 movaps -0x30(TKEYP), KEY
d7866e50 1992 aesenc KEY, STATE
54b6a1bd
HY
1993.align 4
1994.Lenc128:
1995 movaps -0x20(TKEYP), KEY
d7866e50 1996 aesenc KEY, STATE
54b6a1bd 1997 movaps -0x10(TKEYP), KEY
d7866e50 1998 aesenc KEY, STATE
54b6a1bd 1999 movaps (TKEYP), KEY
d7866e50 2000 aesenc KEY, STATE
54b6a1bd 2001 movaps 0x10(TKEYP), KEY
d7866e50 2002 aesenc KEY, STATE
54b6a1bd 2003 movaps 0x20(TKEYP), KEY
d7866e50 2004 aesenc KEY, STATE
54b6a1bd 2005 movaps 0x30(TKEYP), KEY
d7866e50 2006 aesenc KEY, STATE
54b6a1bd 2007 movaps 0x40(TKEYP), KEY
d7866e50 2008 aesenc KEY, STATE
54b6a1bd 2009 movaps 0x50(TKEYP), KEY
d7866e50 2010 aesenc KEY, STATE
54b6a1bd 2011 movaps 0x60(TKEYP), KEY
d7866e50 2012 aesenc KEY, STATE
54b6a1bd 2013 movaps 0x70(TKEYP), KEY
d7866e50 2014 aesenclast KEY, STATE
f94909ce 2015 RET
74d8b90a 2016SYM_FUNC_END(_aesni_enc1)
54b6a1bd
HY
2017
2018/*
2019 * _aesni_enc4: internal ABI
2020 * input:
2021 * KEYP: key struct pointer
2022 * KLEN: round count
2023 * STATE1: initial state (input)
2024 * STATE2
2025 * STATE3
2026 * STATE4
2027 * output:
2028 * STATE1: finial state (output)
2029 * STATE2
2030 * STATE3
2031 * STATE4
2032 * changed:
2033 * KEY
2034 * TKEYP (T1)
2035 */
74d8b90a 2036SYM_FUNC_START_LOCAL(_aesni_enc4)
54b6a1bd
HY
2037 movaps (KEYP), KEY # key
2038 mov KEYP, TKEYP
2039 pxor KEY, STATE1 # round 0
2040 pxor KEY, STATE2
2041 pxor KEY, STATE3
2042 pxor KEY, STATE4
2043 add $0x30, TKEYP
2044 cmp $24, KLEN
2045 jb .L4enc128
2046 lea 0x20(TKEYP), TKEYP
2047 je .L4enc192
2048 add $0x20, TKEYP
2049 movaps -0x60(TKEYP), KEY
d7866e50
UB
2050 aesenc KEY, STATE1
2051 aesenc KEY, STATE2
2052 aesenc KEY, STATE3
2053 aesenc KEY, STATE4
54b6a1bd 2054 movaps -0x50(TKEYP), KEY
d7866e50
UB
2055 aesenc KEY, STATE1
2056 aesenc KEY, STATE2
2057 aesenc KEY, STATE3
2058 aesenc KEY, STATE4
54b6a1bd
HY
2059#.align 4
2060.L4enc192:
2061 movaps -0x40(TKEYP), KEY
d7866e50
UB
2062 aesenc KEY, STATE1
2063 aesenc KEY, STATE2
2064 aesenc KEY, STATE3
2065 aesenc KEY, STATE4
54b6a1bd 2066 movaps -0x30(TKEYP), KEY
d7866e50
UB
2067 aesenc KEY, STATE1
2068 aesenc KEY, STATE2
2069 aesenc KEY, STATE3
2070 aesenc KEY, STATE4
54b6a1bd
HY
2071#.align 4
2072.L4enc128:
2073 movaps -0x20(TKEYP), KEY
d7866e50
UB
2074 aesenc KEY, STATE1
2075 aesenc KEY, STATE2
2076 aesenc KEY, STATE3
2077 aesenc KEY, STATE4
54b6a1bd 2078 movaps -0x10(TKEYP), KEY
d7866e50
UB
2079 aesenc KEY, STATE1
2080 aesenc KEY, STATE2
2081 aesenc KEY, STATE3
2082 aesenc KEY, STATE4
54b6a1bd 2083 movaps (TKEYP), KEY
d7866e50
UB
2084 aesenc KEY, STATE1
2085 aesenc KEY, STATE2
2086 aesenc KEY, STATE3
2087 aesenc KEY, STATE4
54b6a1bd 2088 movaps 0x10(TKEYP), KEY
d7866e50
UB
2089 aesenc KEY, STATE1
2090 aesenc KEY, STATE2
2091 aesenc KEY, STATE3
2092 aesenc KEY, STATE4
54b6a1bd 2093 movaps 0x20(TKEYP), KEY
d7866e50
UB
2094 aesenc KEY, STATE1
2095 aesenc KEY, STATE2
2096 aesenc KEY, STATE3
2097 aesenc KEY, STATE4
54b6a1bd 2098 movaps 0x30(TKEYP), KEY
d7866e50
UB
2099 aesenc KEY, STATE1
2100 aesenc KEY, STATE2
2101 aesenc KEY, STATE3
2102 aesenc KEY, STATE4
54b6a1bd 2103 movaps 0x40(TKEYP), KEY
d7866e50
UB
2104 aesenc KEY, STATE1
2105 aesenc KEY, STATE2
2106 aesenc KEY, STATE3
2107 aesenc KEY, STATE4
54b6a1bd 2108 movaps 0x50(TKEYP), KEY
d7866e50
UB
2109 aesenc KEY, STATE1
2110 aesenc KEY, STATE2
2111 aesenc KEY, STATE3
2112 aesenc KEY, STATE4
54b6a1bd 2113 movaps 0x60(TKEYP), KEY
d7866e50
UB
2114 aesenc KEY, STATE1
2115 aesenc KEY, STATE2
2116 aesenc KEY, STATE3
2117 aesenc KEY, STATE4
54b6a1bd 2118 movaps 0x70(TKEYP), KEY
d7866e50
UB
2119 aesenclast KEY, STATE1 # last round
2120 aesenclast KEY, STATE2
2121 aesenclast KEY, STATE3
2122 aesenclast KEY, STATE4
f94909ce 2123 RET
74d8b90a 2124SYM_FUNC_END(_aesni_enc4)
54b6a1bd
HY
2125
2126/*
9c1e8836 2127 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
54b6a1bd 2128 */
6dcc5627 2129SYM_FUNC_START(aesni_dec)
8691ccd7 2130 FRAME_BEGIN
0d258efb
MK
2131#ifndef __x86_64__
2132 pushl KEYP
2133 pushl KLEN
8691ccd7
JP
2134 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2135 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2136 movl (FRAME_OFFSET+20)(%esp), INP # src
0d258efb 2137#endif
54b6a1bd
HY
2138 mov 480(KEYP), KLEN # key length
2139 add $240, KEYP
2140 movups (INP), STATE # input
2141 call _aesni_dec1
2142 movups STATE, (OUTP) #output
0d258efb
MK
2143#ifndef __x86_64__
2144 popl KLEN
2145 popl KEYP
2146#endif
8691ccd7 2147 FRAME_END
f94909ce 2148 RET
6dcc5627 2149SYM_FUNC_END(aesni_dec)
54b6a1bd
HY
2150
2151/*
2152 * _aesni_dec1: internal ABI
2153 * input:
2154 * KEYP: key struct pointer
2155 * KLEN: key length
2156 * STATE: initial state (input)
2157 * output:
2158 * STATE: finial state (output)
2159 * changed:
2160 * KEY
2161 * TKEYP (T1)
2162 */
74d8b90a 2163SYM_FUNC_START_LOCAL(_aesni_dec1)
54b6a1bd
HY
2164 movaps (KEYP), KEY # key
2165 mov KEYP, TKEYP
2166 pxor KEY, STATE # round 0
2167 add $0x30, TKEYP
2168 cmp $24, KLEN
2169 jb .Ldec128
2170 lea 0x20(TKEYP), TKEYP
2171 je .Ldec192
2172 add $0x20, TKEYP
2173 movaps -0x60(TKEYP), KEY
d7866e50 2174 aesdec KEY, STATE
54b6a1bd 2175 movaps -0x50(TKEYP), KEY
d7866e50 2176 aesdec KEY, STATE
54b6a1bd
HY
2177.align 4
2178.Ldec192:
2179 movaps -0x40(TKEYP), KEY
d7866e50 2180 aesdec KEY, STATE
54b6a1bd 2181 movaps -0x30(TKEYP), KEY
d7866e50 2182 aesdec KEY, STATE
54b6a1bd
HY
2183.align 4
2184.Ldec128:
2185 movaps -0x20(TKEYP), KEY
d7866e50 2186 aesdec KEY, STATE
54b6a1bd 2187 movaps -0x10(TKEYP), KEY
d7866e50 2188 aesdec KEY, STATE
54b6a1bd 2189 movaps (TKEYP), KEY
d7866e50 2190 aesdec KEY, STATE
54b6a1bd 2191 movaps 0x10(TKEYP), KEY
d7866e50 2192 aesdec KEY, STATE
54b6a1bd 2193 movaps 0x20(TKEYP), KEY
d7866e50 2194 aesdec KEY, STATE
54b6a1bd 2195 movaps 0x30(TKEYP), KEY
d7866e50 2196 aesdec KEY, STATE
54b6a1bd 2197 movaps 0x40(TKEYP), KEY
d7866e50 2198 aesdec KEY, STATE
54b6a1bd 2199 movaps 0x50(TKEYP), KEY
d7866e50 2200 aesdec KEY, STATE
54b6a1bd 2201 movaps 0x60(TKEYP), KEY
d7866e50 2202 aesdec KEY, STATE
54b6a1bd 2203 movaps 0x70(TKEYP), KEY
d7866e50 2204 aesdeclast KEY, STATE
f94909ce 2205 RET
74d8b90a 2206SYM_FUNC_END(_aesni_dec1)
54b6a1bd
HY
2207
2208/*
2209 * _aesni_dec4: internal ABI
2210 * input:
2211 * KEYP: key struct pointer
2212 * KLEN: key length
2213 * STATE1: initial state (input)
2214 * STATE2
2215 * STATE3
2216 * STATE4
2217 * output:
2218 * STATE1: finial state (output)
2219 * STATE2
2220 * STATE3
2221 * STATE4
2222 * changed:
2223 * KEY
2224 * TKEYP (T1)
2225 */
74d8b90a 2226SYM_FUNC_START_LOCAL(_aesni_dec4)
54b6a1bd
HY
2227 movaps (KEYP), KEY # key
2228 mov KEYP, TKEYP
2229 pxor KEY, STATE1 # round 0
2230 pxor KEY, STATE2
2231 pxor KEY, STATE3
2232 pxor KEY, STATE4
2233 add $0x30, TKEYP
2234 cmp $24, KLEN
2235 jb .L4dec128
2236 lea 0x20(TKEYP), TKEYP
2237 je .L4dec192
2238 add $0x20, TKEYP
2239 movaps -0x60(TKEYP), KEY
d7866e50
UB
2240 aesdec KEY, STATE1
2241 aesdec KEY, STATE2
2242 aesdec KEY, STATE3
2243 aesdec KEY, STATE4
54b6a1bd 2244 movaps -0x50(TKEYP), KEY
d7866e50
UB
2245 aesdec KEY, STATE1
2246 aesdec KEY, STATE2
2247 aesdec KEY, STATE3
2248 aesdec KEY, STATE4
54b6a1bd
HY
2249.align 4
2250.L4dec192:
2251 movaps -0x40(TKEYP), KEY
d7866e50
UB
2252 aesdec KEY, STATE1
2253 aesdec KEY, STATE2
2254 aesdec KEY, STATE3
2255 aesdec KEY, STATE4
54b6a1bd 2256 movaps -0x30(TKEYP), KEY
d7866e50
UB
2257 aesdec KEY, STATE1
2258 aesdec KEY, STATE2
2259 aesdec KEY, STATE3
2260 aesdec KEY, STATE4
54b6a1bd
HY
2261.align 4
2262.L4dec128:
2263 movaps -0x20(TKEYP), KEY
d7866e50
UB
2264 aesdec KEY, STATE1
2265 aesdec KEY, STATE2
2266 aesdec KEY, STATE3
2267 aesdec KEY, STATE4
54b6a1bd 2268 movaps -0x10(TKEYP), KEY
d7866e50
UB
2269 aesdec KEY, STATE1
2270 aesdec KEY, STATE2
2271 aesdec KEY, STATE3
2272 aesdec KEY, STATE4
54b6a1bd 2273 movaps (TKEYP), KEY
d7866e50
UB
2274 aesdec KEY, STATE1
2275 aesdec KEY, STATE2
2276 aesdec KEY, STATE3
2277 aesdec KEY, STATE4
54b6a1bd 2278 movaps 0x10(TKEYP), KEY
d7866e50
UB
2279 aesdec KEY, STATE1
2280 aesdec KEY, STATE2
2281 aesdec KEY, STATE3
2282 aesdec KEY, STATE4
54b6a1bd 2283 movaps 0x20(TKEYP), KEY
d7866e50
UB
2284 aesdec KEY, STATE1
2285 aesdec KEY, STATE2
2286 aesdec KEY, STATE3
2287 aesdec KEY, STATE4
54b6a1bd 2288 movaps 0x30(TKEYP), KEY
d7866e50
UB
2289 aesdec KEY, STATE1
2290 aesdec KEY, STATE2
2291 aesdec KEY, STATE3
2292 aesdec KEY, STATE4
54b6a1bd 2293 movaps 0x40(TKEYP), KEY
d7866e50
UB
2294 aesdec KEY, STATE1
2295 aesdec KEY, STATE2
2296 aesdec KEY, STATE3
2297 aesdec KEY, STATE4
54b6a1bd 2298 movaps 0x50(TKEYP), KEY
d7866e50
UB
2299 aesdec KEY, STATE1
2300 aesdec KEY, STATE2
2301 aesdec KEY, STATE3
2302 aesdec KEY, STATE4
54b6a1bd 2303 movaps 0x60(TKEYP), KEY
d7866e50
UB
2304 aesdec KEY, STATE1
2305 aesdec KEY, STATE2
2306 aesdec KEY, STATE3
2307 aesdec KEY, STATE4
54b6a1bd 2308 movaps 0x70(TKEYP), KEY
d7866e50
UB
2309 aesdeclast KEY, STATE1 # last round
2310 aesdeclast KEY, STATE2
2311 aesdeclast KEY, STATE3
2312 aesdeclast KEY, STATE4
f94909ce 2313 RET
74d8b90a 2314SYM_FUNC_END(_aesni_dec4)
54b6a1bd
HY
2315
2316/*
2317 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2318 * size_t len)
2319 */
6dcc5627 2320SYM_FUNC_START(aesni_ecb_enc)
8691ccd7 2321 FRAME_BEGIN
0d258efb
MK
2322#ifndef __x86_64__
2323 pushl LEN
2324 pushl KEYP
2325 pushl KLEN
8691ccd7
JP
2326 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2327 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2328 movl (FRAME_OFFSET+24)(%esp), INP # src
2329 movl (FRAME_OFFSET+28)(%esp), LEN # len
0d258efb 2330#endif
54b6a1bd
HY
2331 test LEN, LEN # check length
2332 jz .Lecb_enc_ret
2333 mov 480(KEYP), KLEN
2334 cmp $16, LEN
2335 jb .Lecb_enc_ret
2336 cmp $64, LEN
2337 jb .Lecb_enc_loop1
2338.align 4
2339.Lecb_enc_loop4:
2340 movups (INP), STATE1
2341 movups 0x10(INP), STATE2
2342 movups 0x20(INP), STATE3
2343 movups 0x30(INP), STATE4
2344 call _aesni_enc4
2345 movups STATE1, (OUTP)
2346 movups STATE2, 0x10(OUTP)
2347 movups STATE3, 0x20(OUTP)
2348 movups STATE4, 0x30(OUTP)
2349 sub $64, LEN
2350 add $64, INP
2351 add $64, OUTP
2352 cmp $64, LEN
2353 jge .Lecb_enc_loop4
2354 cmp $16, LEN
2355 jb .Lecb_enc_ret
2356.align 4
2357.Lecb_enc_loop1:
2358 movups (INP), STATE1
2359 call _aesni_enc1
2360 movups STATE1, (OUTP)
2361 sub $16, LEN
2362 add $16, INP
2363 add $16, OUTP
2364 cmp $16, LEN
2365 jge .Lecb_enc_loop1
2366.Lecb_enc_ret:
0d258efb
MK
2367#ifndef __x86_64__
2368 popl KLEN
2369 popl KEYP
2370 popl LEN
2371#endif
8691ccd7 2372 FRAME_END
f94909ce 2373 RET
6dcc5627 2374SYM_FUNC_END(aesni_ecb_enc)
54b6a1bd
HY
2375
2376/*
2377 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2378 * size_t len);
2379 */
6dcc5627 2380SYM_FUNC_START(aesni_ecb_dec)
8691ccd7 2381 FRAME_BEGIN
0d258efb
MK
2382#ifndef __x86_64__
2383 pushl LEN
2384 pushl KEYP
2385 pushl KLEN
8691ccd7
JP
2386 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2387 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2388 movl (FRAME_OFFSET+24)(%esp), INP # src
2389 movl (FRAME_OFFSET+28)(%esp), LEN # len
0d258efb 2390#endif
54b6a1bd
HY
2391 test LEN, LEN
2392 jz .Lecb_dec_ret
2393 mov 480(KEYP), KLEN
2394 add $240, KEYP
2395 cmp $16, LEN
2396 jb .Lecb_dec_ret
2397 cmp $64, LEN
2398 jb .Lecb_dec_loop1
2399.align 4
2400.Lecb_dec_loop4:
2401 movups (INP), STATE1
2402 movups 0x10(INP), STATE2
2403 movups 0x20(INP), STATE3
2404 movups 0x30(INP), STATE4
2405 call _aesni_dec4
2406 movups STATE1, (OUTP)
2407 movups STATE2, 0x10(OUTP)
2408 movups STATE3, 0x20(OUTP)
2409 movups STATE4, 0x30(OUTP)
2410 sub $64, LEN
2411 add $64, INP
2412 add $64, OUTP
2413 cmp $64, LEN
2414 jge .Lecb_dec_loop4
2415 cmp $16, LEN
2416 jb .Lecb_dec_ret
2417.align 4
2418.Lecb_dec_loop1:
2419 movups (INP), STATE1
2420 call _aesni_dec1
2421 movups STATE1, (OUTP)
2422 sub $16, LEN
2423 add $16, INP
2424 add $16, OUTP
2425 cmp $16, LEN
2426 jge .Lecb_dec_loop1
2427.Lecb_dec_ret:
0d258efb
MK
2428#ifndef __x86_64__
2429 popl KLEN
2430 popl KEYP
2431 popl LEN
2432#endif
8691ccd7 2433 FRAME_END
f94909ce 2434 RET
6dcc5627 2435SYM_FUNC_END(aesni_ecb_dec)
54b6a1bd
HY
2436
2437/*
2438 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2439 * size_t len, u8 *iv)
2440 */
6dcc5627 2441SYM_FUNC_START(aesni_cbc_enc)
8691ccd7 2442 FRAME_BEGIN
0d258efb
MK
2443#ifndef __x86_64__
2444 pushl IVP
2445 pushl LEN
2446 pushl KEYP
2447 pushl KLEN
8691ccd7
JP
2448 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2449 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2450 movl (FRAME_OFFSET+28)(%esp), INP # src
2451 movl (FRAME_OFFSET+32)(%esp), LEN # len
2452 movl (FRAME_OFFSET+36)(%esp), IVP # iv
0d258efb 2453#endif
54b6a1bd
HY
2454 cmp $16, LEN
2455 jb .Lcbc_enc_ret
2456 mov 480(KEYP), KLEN
2457 movups (IVP), STATE # load iv as initial state
2458.align 4
2459.Lcbc_enc_loop:
2460 movups (INP), IN # load input
2461 pxor IN, STATE
2462 call _aesni_enc1
2463 movups STATE, (OUTP) # store output
2464 sub $16, LEN
2465 add $16, INP
2466 add $16, OUTP
2467 cmp $16, LEN
2468 jge .Lcbc_enc_loop
2469 movups STATE, (IVP)
2470.Lcbc_enc_ret:
0d258efb
MK
2471#ifndef __x86_64__
2472 popl KLEN
2473 popl KEYP
2474 popl LEN
2475 popl IVP
2476#endif
8691ccd7 2477 FRAME_END
f94909ce 2478 RET
6dcc5627 2479SYM_FUNC_END(aesni_cbc_enc)
54b6a1bd
HY
2480
2481/*
2482 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2483 * size_t len, u8 *iv)
2484 */
6dcc5627 2485SYM_FUNC_START(aesni_cbc_dec)
8691ccd7 2486 FRAME_BEGIN
0d258efb
MK
2487#ifndef __x86_64__
2488 pushl IVP
2489 pushl LEN
2490 pushl KEYP
2491 pushl KLEN
8691ccd7
JP
2492 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2493 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2494 movl (FRAME_OFFSET+28)(%esp), INP # src
2495 movl (FRAME_OFFSET+32)(%esp), LEN # len
2496 movl (FRAME_OFFSET+36)(%esp), IVP # iv
0d258efb 2497#endif
54b6a1bd 2498 cmp $16, LEN
e6efaa02 2499 jb .Lcbc_dec_just_ret
54b6a1bd
HY
2500 mov 480(KEYP), KLEN
2501 add $240, KEYP
2502 movups (IVP), IV
2503 cmp $64, LEN
2504 jb .Lcbc_dec_loop1
2505.align 4
2506.Lcbc_dec_loop4:
2507 movups (INP), IN1
2508 movaps IN1, STATE1
2509 movups 0x10(INP), IN2
2510 movaps IN2, STATE2
0d258efb 2511#ifdef __x86_64__
54b6a1bd
HY
2512 movups 0x20(INP), IN3
2513 movaps IN3, STATE3
2514 movups 0x30(INP), IN4
2515 movaps IN4, STATE4
0d258efb
MK
2516#else
2517 movups 0x20(INP), IN1
2518 movaps IN1, STATE3
2519 movups 0x30(INP), IN2
2520 movaps IN2, STATE4
2521#endif
54b6a1bd
HY
2522 call _aesni_dec4
2523 pxor IV, STATE1
0d258efb 2524#ifdef __x86_64__
54b6a1bd
HY
2525 pxor IN1, STATE2
2526 pxor IN2, STATE3
2527 pxor IN3, STATE4
2528 movaps IN4, IV
0d258efb 2529#else
0d258efb
MK
2530 pxor IN1, STATE4
2531 movaps IN2, IV
7c8d5184
MK
2532 movups (INP), IN1
2533 pxor IN1, STATE2
2534 movups 0x10(INP), IN2
2535 pxor IN2, STATE3
0d258efb 2536#endif
54b6a1bd
HY
2537 movups STATE1, (OUTP)
2538 movups STATE2, 0x10(OUTP)
2539 movups STATE3, 0x20(OUTP)
2540 movups STATE4, 0x30(OUTP)
2541 sub $64, LEN
2542 add $64, INP
2543 add $64, OUTP
2544 cmp $64, LEN
2545 jge .Lcbc_dec_loop4
2546 cmp $16, LEN
2547 jb .Lcbc_dec_ret
2548.align 4
2549.Lcbc_dec_loop1:
2550 movups (INP), IN
2551 movaps IN, STATE
2552 call _aesni_dec1
2553 pxor IV, STATE
2554 movups STATE, (OUTP)
2555 movaps IN, IV
2556 sub $16, LEN
2557 add $16, INP
2558 add $16, OUTP
2559 cmp $16, LEN
2560 jge .Lcbc_dec_loop1
54b6a1bd 2561.Lcbc_dec_ret:
e6efaa02
HY
2562 movups IV, (IVP)
2563.Lcbc_dec_just_ret:
0d258efb
MK
2564#ifndef __x86_64__
2565 popl KLEN
2566 popl KEYP
2567 popl LEN
2568 popl IVP
2569#endif
8691ccd7 2570 FRAME_END
f94909ce 2571 RET
6dcc5627 2572SYM_FUNC_END(aesni_cbc_dec)
12387a46 2573
ddf169a9
AB
2574/*
2575 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2576 * size_t len, u8 *iv)
2577 */
2578SYM_FUNC_START(aesni_cts_cbc_enc)
2579 FRAME_BEGIN
2580#ifndef __x86_64__
2581 pushl IVP
2582 pushl LEN
2583 pushl KEYP
2584 pushl KLEN
2585 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2586 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2587 movl (FRAME_OFFSET+28)(%esp), INP # src
2588 movl (FRAME_OFFSET+32)(%esp), LEN # len
2589 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2590 lea .Lcts_permute_table, T1
2591#else
2592 lea .Lcts_permute_table(%rip), T1
2593#endif
2594 mov 480(KEYP), KLEN
2595 movups (IVP), STATE
2596 sub $16, LEN
2597 mov T1, IVP
2598 add $32, IVP
2599 add LEN, T1
2600 sub LEN, IVP
2601 movups (T1), %xmm4
2602 movups (IVP), %xmm5
2603
2604 movups (INP), IN1
2605 add LEN, INP
2606 movups (INP), IN2
2607
2608 pxor IN1, STATE
2609 call _aesni_enc1
2610
2611 pshufb %xmm5, IN2
2612 pxor STATE, IN2
2613 pshufb %xmm4, STATE
2614 add OUTP, LEN
2615 movups STATE, (LEN)
2616
2617 movaps IN2, STATE
2618 call _aesni_enc1
2619 movups STATE, (OUTP)
2620
2621#ifndef __x86_64__
2622 popl KLEN
2623 popl KEYP
2624 popl LEN
2625 popl IVP
2626#endif
2627 FRAME_END
f94909ce 2628 RET
ddf169a9
AB
2629SYM_FUNC_END(aesni_cts_cbc_enc)
2630
2631/*
2632 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2633 * size_t len, u8 *iv)
2634 */
2635SYM_FUNC_START(aesni_cts_cbc_dec)
2636 FRAME_BEGIN
2637#ifndef __x86_64__
2638 pushl IVP
2639 pushl LEN
2640 pushl KEYP
2641 pushl KLEN
2642 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2643 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2644 movl (FRAME_OFFSET+28)(%esp), INP # src
2645 movl (FRAME_OFFSET+32)(%esp), LEN # len
2646 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2647 lea .Lcts_permute_table, T1
2648#else
2649 lea .Lcts_permute_table(%rip), T1
2650#endif
2651 mov 480(KEYP), KLEN
2652 add $240, KEYP
2653 movups (IVP), IV
2654 sub $16, LEN
2655 mov T1, IVP
2656 add $32, IVP
2657 add LEN, T1
2658 sub LEN, IVP
2659 movups (T1), %xmm4
2660
2661 movups (INP), STATE
2662 add LEN, INP
2663 movups (INP), IN1
2664
2665 call _aesni_dec1
2666 movaps STATE, IN2
2667 pshufb %xmm4, STATE
2668 pxor IN1, STATE
2669
2670 add OUTP, LEN
2671 movups STATE, (LEN)
2672
2673 movups (IVP), %xmm0
2674 pshufb %xmm0, IN1
2675 pblendvb IN2, IN1
2676 movaps IN1, STATE
2677 call _aesni_dec1
2678
2679 pxor IV, STATE
2680 movups STATE, (OUTP)
2681
2682#ifndef __x86_64__
2683 popl KLEN
2684 popl KEYP
2685 popl LEN
2686 popl IVP
2687#endif
2688 FRAME_END
f94909ce 2689 RET
ddf169a9
AB
2690SYM_FUNC_END(aesni_cts_cbc_dec)
2691
1253cab8 2692.pushsection .rodata
12387a46 2693.align 16
ddf169a9
AB
2694.Lcts_permute_table:
2695 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2696 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2697 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
2698 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
2699 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2700 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2701#ifdef __x86_64__
12387a46
HY
2702.Lbswap_mask:
2703 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
ddf169a9 2704#endif
1253cab8 2705.popsection
12387a46 2706
ddf169a9 2707#ifdef __x86_64__
12387a46
HY
2708/*
2709 * _aesni_inc_init: internal ABI
2710 * setup registers used by _aesni_inc
2711 * input:
2712 * IV
2713 * output:
2714 * CTR: == IV, in little endian
2715 * TCTR_LOW: == lower qword of CTR
2716 * INC: == 1, in little endian
2717 * BSWAP_MASK == endian swapping mask
2718 */
74d8b90a 2719SYM_FUNC_START_LOCAL(_aesni_inc_init)
c75962f1 2720 movaps .Lbswap_mask(%rip), BSWAP_MASK
12387a46 2721 movaps IV, CTR
d7866e50 2722 pshufb BSWAP_MASK, CTR
12387a46 2723 mov $1, TCTR_LOW
d7866e50
UB
2724 movq TCTR_LOW, INC
2725 movq CTR, TCTR_LOW
f94909ce 2726 RET
74d8b90a 2727SYM_FUNC_END(_aesni_inc_init)
12387a46
HY
2728
2729/*
2730 * _aesni_inc: internal ABI
2731 * Increase IV by 1, IV is in big endian
2732 * input:
2733 * IV
2734 * CTR: == IV, in little endian
2735 * TCTR_LOW: == lower qword of CTR
2736 * INC: == 1, in little endian
2737 * BSWAP_MASK == endian swapping mask
2738 * output:
2739 * IV: Increase by 1
2740 * changed:
2741 * CTR: == output IV, in little endian
2742 * TCTR_LOW: == lower qword of CTR
2743 */
74d8b90a 2744SYM_FUNC_START_LOCAL(_aesni_inc)
12387a46
HY
2745 paddq INC, CTR
2746 add $1, TCTR_LOW
2747 jnc .Linc_low
2748 pslldq $8, INC
2749 paddq INC, CTR
2750 psrldq $8, INC
2751.Linc_low:
2752 movaps CTR, IV
d7866e50 2753 pshufb BSWAP_MASK, IV
f94909ce 2754 RET
74d8b90a 2755SYM_FUNC_END(_aesni_inc)
12387a46
HY
2756
2757/*
2758 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2759 * size_t len, u8 *iv)
2760 */
6dcc5627 2761SYM_FUNC_START(aesni_ctr_enc)
8691ccd7 2762 FRAME_BEGIN
12387a46
HY
2763 cmp $16, LEN
2764 jb .Lctr_enc_just_ret
2765 mov 480(KEYP), KLEN
2766 movups (IVP), IV
2767 call _aesni_inc_init
2768 cmp $64, LEN
2769 jb .Lctr_enc_loop1
2770.align 4
2771.Lctr_enc_loop4:
2772 movaps IV, STATE1
2773 call _aesni_inc
2774 movups (INP), IN1
2775 movaps IV, STATE2
2776 call _aesni_inc
2777 movups 0x10(INP), IN2
2778 movaps IV, STATE3
2779 call _aesni_inc
2780 movups 0x20(INP), IN3
2781 movaps IV, STATE4
2782 call _aesni_inc
2783 movups 0x30(INP), IN4
2784 call _aesni_enc4
2785 pxor IN1, STATE1
2786 movups STATE1, (OUTP)
2787 pxor IN2, STATE2
2788 movups STATE2, 0x10(OUTP)
2789 pxor IN3, STATE3
2790 movups STATE3, 0x20(OUTP)
2791 pxor IN4, STATE4
2792 movups STATE4, 0x30(OUTP)
2793 sub $64, LEN
2794 add $64, INP
2795 add $64, OUTP
2796 cmp $64, LEN
2797 jge .Lctr_enc_loop4
2798 cmp $16, LEN
2799 jb .Lctr_enc_ret
2800.align 4
2801.Lctr_enc_loop1:
2802 movaps IV, STATE
2803 call _aesni_inc
2804 movups (INP), IN
2805 call _aesni_enc1
2806 pxor IN, STATE
2807 movups STATE, (OUTP)
2808 sub $16, LEN
2809 add $16, INP
2810 add $16, OUTP
2811 cmp $16, LEN
2812 jge .Lctr_enc_loop1
2813.Lctr_enc_ret:
2814 movups IV, (IVP)
2815.Lctr_enc_just_ret:
8691ccd7 2816 FRAME_END
f94909ce 2817 RET
6dcc5627 2818SYM_FUNC_END(aesni_ctr_enc)
c456a9cd 2819
2481104f
AB
2820#endif
2821
2822.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
2823.align 16
2824.Lgf128mul_x_ble_mask:
2825 .octa 0x00000000000000010000000000000087
2826.previous
2827
c456a9cd
JK
2828/*
2829 * _aesni_gf128mul_x_ble: internal ABI
2830 * Multiply in GF(2^128) for XTS IVs
2831 * input:
2832 * IV: current IV
2833 * GF128MUL_MASK == mask with 0x87 and 0x01
2834 * output:
2835 * IV: next IV
2836 * changed:
2837 * CTR: == temporary value
2838 */
2839#define _aesni_gf128mul_x_ble() \
2481104f 2840 pshufd $0x13, IV, KEY; \
c456a9cd 2841 paddq IV, IV; \
2481104f
AB
2842 psrad $31, KEY; \
2843 pand GF128MUL_MASK, KEY; \
2844 pxor KEY, IV;
c456a9cd
JK
2845
2846/*
86ad60a6
AB
2847 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2848 * const u8 *src, unsigned int len, le128 *iv)
c456a9cd 2849 */
86ad60a6 2850SYM_FUNC_START(aesni_xts_encrypt)
8691ccd7 2851 FRAME_BEGIN
2481104f
AB
2852#ifndef __x86_64__
2853 pushl IVP
2854 pushl LEN
2855 pushl KEYP
2856 pushl KLEN
2857 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2858 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2859 movl (FRAME_OFFSET+28)(%esp), INP # src
2860 movl (FRAME_OFFSET+32)(%esp), LEN # len
2861 movl (FRAME_OFFSET+36)(%esp), IVP # iv
c456a9cd 2862 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2481104f
AB
2863#else
2864 movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2865#endif
c456a9cd
JK
2866 movups (IVP), IV
2867
2868 mov 480(KEYP), KLEN
c456a9cd 2869
86ad60a6 2870.Lxts_enc_loop4:
2481104f
AB
2871 sub $64, LEN
2872 jl .Lxts_enc_1x
2873
c456a9cd 2874 movdqa IV, STATE1
2481104f
AB
2875 movdqu 0x00(INP), IN
2876 pxor IN, STATE1
c456a9cd
JK
2877 movdqu IV, 0x00(OUTP)
2878
2879 _aesni_gf128mul_x_ble()
2880 movdqa IV, STATE2
2481104f
AB
2881 movdqu 0x10(INP), IN
2882 pxor IN, STATE2
c456a9cd
JK
2883 movdqu IV, 0x10(OUTP)
2884
2885 _aesni_gf128mul_x_ble()
2886 movdqa IV, STATE3
2481104f
AB
2887 movdqu 0x20(INP), IN
2888 pxor IN, STATE3
c456a9cd
JK
2889 movdqu IV, 0x20(OUTP)
2890
2891 _aesni_gf128mul_x_ble()
2892 movdqa IV, STATE4
2481104f
AB
2893 movdqu 0x30(INP), IN
2894 pxor IN, STATE4
c456a9cd
JK
2895 movdqu IV, 0x30(OUTP)
2896
86ad60a6 2897 call _aesni_enc4
c456a9cd 2898
2481104f
AB
2899 movdqu 0x00(OUTP), IN
2900 pxor IN, STATE1
c456a9cd
JK
2901 movdqu STATE1, 0x00(OUTP)
2902
2481104f
AB
2903 movdqu 0x10(OUTP), IN
2904 pxor IN, STATE2
c456a9cd
JK
2905 movdqu STATE2, 0x10(OUTP)
2906
2481104f
AB
2907 movdqu 0x20(OUTP), IN
2908 pxor IN, STATE3
c456a9cd
JK
2909 movdqu STATE3, 0x20(OUTP)
2910
2481104f
AB
2911 movdqu 0x30(OUTP), IN
2912 pxor IN, STATE4
c456a9cd
JK
2913 movdqu STATE4, 0x30(OUTP)
2914
2915 _aesni_gf128mul_x_ble()
c456a9cd 2916
86ad60a6
AB
2917 add $64, INP
2918 add $64, OUTP
2481104f
AB
2919 test LEN, LEN
2920 jnz .Lxts_enc_loop4
86ad60a6 2921
2481104f 2922.Lxts_enc_ret_iv:
c456a9cd
JK
2923 movups IV, (IVP)
2924
2481104f
AB
2925.Lxts_enc_ret:
2926#ifndef __x86_64__
2927 popl KLEN
2928 popl KEYP
2929 popl LEN
2930 popl IVP
2931#endif
86ad60a6 2932 FRAME_END
f94909ce 2933 RET
2481104f
AB
2934
2935.Lxts_enc_1x:
2936 add $64, LEN
2937 jz .Lxts_enc_ret_iv
2938 sub $16, LEN
2939 jl .Lxts_enc_cts4
2940
2941.Lxts_enc_loop1:
2942 movdqu (INP), STATE
2943 pxor IV, STATE
2944 call _aesni_enc1
2945 pxor IV, STATE
2946 _aesni_gf128mul_x_ble()
2947
2948 test LEN, LEN
2949 jz .Lxts_enc_out
2950
2951 add $16, INP
2952 sub $16, LEN
2953 jl .Lxts_enc_cts1
2954
2955 movdqu STATE, (OUTP)
2956 add $16, OUTP
2957 jmp .Lxts_enc_loop1
2958
2959.Lxts_enc_out:
2960 movdqu STATE, (OUTP)
2961 jmp .Lxts_enc_ret_iv
2962
2963.Lxts_enc_cts4:
2964 movdqa STATE4, STATE
2965 sub $16, OUTP
2966
2967.Lxts_enc_cts1:
2968#ifndef __x86_64__
2969 lea .Lcts_permute_table, T1
2970#else
2971 lea .Lcts_permute_table(%rip), T1
2972#endif
2973 add LEN, INP /* rewind input pointer */
2974 add $16, LEN /* # bytes in final block */
2975 movups (INP), IN1
2976
2977 mov T1, IVP
2978 add $32, IVP
2979 add LEN, T1
2980 sub LEN, IVP
2981 add OUTP, LEN
2982
2983 movups (T1), %xmm4
2984 movaps STATE, IN2
2985 pshufb %xmm4, STATE
2986 movups STATE, (LEN)
2987
2988 movups (IVP), %xmm0
2989 pshufb %xmm0, IN1
2990 pblendvb IN2, IN1
2991 movaps IN1, STATE
2992
2993 pxor IV, STATE
2994 call _aesni_enc1
2995 pxor IV, STATE
2996
2997 movups STATE, (OUTP)
2998 jmp .Lxts_enc_ret
86ad60a6
AB
2999SYM_FUNC_END(aesni_xts_encrypt)
3000
3001/*
3002 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
3003 * const u8 *src, unsigned int len, le128 *iv)
3004 */
3005SYM_FUNC_START(aesni_xts_decrypt)
3006 FRAME_BEGIN
2481104f
AB
3007#ifndef __x86_64__
3008 pushl IVP
3009 pushl LEN
3010 pushl KEYP
3011 pushl KLEN
3012 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
3013 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
3014 movl (FRAME_OFFSET+28)(%esp), INP # src
3015 movl (FRAME_OFFSET+32)(%esp), LEN # len
3016 movl (FRAME_OFFSET+36)(%esp), IVP # iv
86ad60a6 3017 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2481104f
AB
3018#else
3019 movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
3020#endif
86ad60a6
AB
3021 movups (IVP), IV
3022
3023 mov 480(KEYP), KLEN
3024 add $240, KEYP
c456a9cd 3025
2481104f
AB
3026 test $15, LEN
3027 jz .Lxts_dec_loop4
3028 sub $16, LEN
3029
86ad60a6 3030.Lxts_dec_loop4:
2481104f
AB
3031 sub $64, LEN
3032 jl .Lxts_dec_1x
3033
86ad60a6 3034 movdqa IV, STATE1
2481104f
AB
3035 movdqu 0x00(INP), IN
3036 pxor IN, STATE1
86ad60a6 3037 movdqu IV, 0x00(OUTP)
c456a9cd 3038
86ad60a6
AB
3039 _aesni_gf128mul_x_ble()
3040 movdqa IV, STATE2
2481104f
AB
3041 movdqu 0x10(INP), IN
3042 pxor IN, STATE2
86ad60a6
AB
3043 movdqu IV, 0x10(OUTP)
3044
3045 _aesni_gf128mul_x_ble()
3046 movdqa IV, STATE3
2481104f
AB
3047 movdqu 0x20(INP), IN
3048 pxor IN, STATE3
86ad60a6
AB
3049 movdqu IV, 0x20(OUTP)
3050
3051 _aesni_gf128mul_x_ble()
3052 movdqa IV, STATE4
2481104f
AB
3053 movdqu 0x30(INP), IN
3054 pxor IN, STATE4
86ad60a6
AB
3055 movdqu IV, 0x30(OUTP)
3056
3057 call _aesni_dec4
3058
2481104f
AB
3059 movdqu 0x00(OUTP), IN
3060 pxor IN, STATE1
86ad60a6
AB
3061 movdqu STATE1, 0x00(OUTP)
3062
2481104f
AB
3063 movdqu 0x10(OUTP), IN
3064 pxor IN, STATE2
86ad60a6 3065 movdqu STATE2, 0x10(OUTP)
c456a9cd 3066
2481104f
AB
3067 movdqu 0x20(OUTP), IN
3068 pxor IN, STATE3
86ad60a6 3069 movdqu STATE3, 0x20(OUTP)
c456a9cd 3070
2481104f
AB
3071 movdqu 0x30(OUTP), IN
3072 pxor IN, STATE4
86ad60a6
AB
3073 movdqu STATE4, 0x30(OUTP)
3074
3075 _aesni_gf128mul_x_ble()
3076
3077 add $64, INP
3078 add $64, OUTP
2481104f
AB
3079 test LEN, LEN
3080 jnz .Lxts_dec_loop4
86ad60a6 3081
2481104f 3082.Lxts_dec_ret_iv:
86ad60a6 3083 movups IV, (IVP)
c456a9cd 3084
2481104f
AB
3085.Lxts_dec_ret:
3086#ifndef __x86_64__
3087 popl KLEN
3088 popl KEYP
3089 popl LEN
3090 popl IVP
3091#endif
8691ccd7 3092 FRAME_END
f94909ce 3093 RET
c456a9cd 3094
2481104f
AB
3095.Lxts_dec_1x:
3096 add $64, LEN
3097 jz .Lxts_dec_ret_iv
3098
3099.Lxts_dec_loop1:
3100 movdqu (INP), STATE
3101
3102 add $16, INP
3103 sub $16, LEN
3104 jl .Lxts_dec_cts1
3105
3106 pxor IV, STATE
3107 call _aesni_dec1
3108 pxor IV, STATE
3109 _aesni_gf128mul_x_ble()
3110
3111 test LEN, LEN
3112 jz .Lxts_dec_out
3113
3114 movdqu STATE, (OUTP)
3115 add $16, OUTP
3116 jmp .Lxts_dec_loop1
3117
3118.Lxts_dec_out:
3119 movdqu STATE, (OUTP)
3120 jmp .Lxts_dec_ret_iv
3121
3122.Lxts_dec_cts1:
3123 movdqa IV, STATE4
3124 _aesni_gf128mul_x_ble()
3125
3126 pxor IV, STATE
3127 call _aesni_dec1
3128 pxor IV, STATE
3129
3130#ifndef __x86_64__
3131 lea .Lcts_permute_table, T1
3132#else
3133 lea .Lcts_permute_table(%rip), T1
0d258efb 3134#endif
2481104f
AB
3135 add LEN, INP /* rewind input pointer */
3136 add $16, LEN /* # bytes in final block */
3137 movups (INP), IN1
3138
3139 mov T1, IVP
3140 add $32, IVP
3141 add LEN, T1
3142 sub LEN, IVP
3143 add OUTP, LEN
3144
3145 movups (T1), %xmm4
3146 movaps STATE, IN2
3147 pshufb %xmm4, STATE
3148 movups STATE, (LEN)
3149
3150 movups (IVP), %xmm0
3151 pshufb %xmm0, IN1
3152 pblendvb IN2, IN1
3153 movaps IN1, STATE
3154
3155 pxor STATE4, STATE
3156 call _aesni_dec1
3157 pxor STATE4, STATE
3158
3159 movups STATE, (OUTP)
3160 jmp .Lxts_dec_ret
3161SYM_FUNC_END(aesni_xts_decrypt)