1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
## This code was derived and highly optimized from the code described in
## the paper:
## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
## on Intel Architecture Processors. August, 2010
## The details of the implementation are explained in:
## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
## on Intel Architecture Processors. October, 2012.
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
73 ## AAD padded to 128 bits with 0
74 ## for example, assume AAD is a u32 vector
78 ## padded AAD in xmm register = {A1 A0 0 0}
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
## AAD[3] = {A0, A1, A2};
94 ## padded AAD in xmm register = {A2 A1 A0 0}
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
## From the definition of the spec, aadLen can only be 8 or 12 bytes.
## The code additionally supports an aadLen of 16 bytes.
## From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one-tab and two-tab indentations are used. One tab
## is for the GHASH part, two tabs are for the AES part.
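##
## Background on the representation (a note, consistent with the GHASH_MUL
## headers below): GHASH operates on bit-reflected field elements, so a raw
## carry-less multiply of two reflected values in effect computes the
## reflected product divided by x. The code compensates once, up front, by
## storing HashKey<<1 mod poly (HashKey divided by x in the reflected
## domain); GHASH_MUL then returns A*B*x mod poly, the two factors of x
## cancel, and the net effect per block is a plain A*HashKey mod poly.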
122 #include <linux/linkage.h>
124 # constants in mergeable sections, linker can reorder and merge
125 .section .rodata.cst16.POLY, "aM", @progbits, 16
127 POLY: .octa 0xC2000000000000000000000000000001
129 .section .rodata.cst16.POLY2, "aM", @progbits, 16
131 POLY2: .octa 0xC20000000000000000000001C2000000
133 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
135 TWOONE: .octa 0x00000001000000000000000000000001
137 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
139 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
141 .section .rodata.cst16.ONE, "aM", @progbits, 16
143 ONE: .octa 0x00000000000000000000000000000001
145 .section .rodata.cst16.ONEf, "aM", @progbits, 16
147 ONEf: .octa 0x01000000000000000000000000000000
# The order of these constants must not change:
# ALL_F must follow SHIFT_MASK, and the zero block must follow ALL_F
151 .section .rodata, "a", @progbits
153 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
154 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
155 .octa 0x00000000000000000000000000000000
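#
# Layout trick: keeping SHIFT_MASK, ALL_F and the zero block contiguous
# lets one adjusted pointer serve two purposes. A 16-byte load taken at
# SHIFT_MASK plus some byte offset yields a vpshufb control that shifts a
# register by that many bytes, and a load at the fixed displacement
# ALL_F-SHIFT_MASK from the same adjusted pointer straddles ALL_F and the
# zero block, producing exactly the 0xff/0x00 byte mask that matches the
# shift. Both GCM_ENC_DEC and PARTIAL_BLOCK below rely on this.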
162 #define InLen (16*1)+8
163 #define PBlockEncKey 16*2
165 #define CurCount 16*4
166 #define PBlockLen 16*5
168 HashKey = 16*6 # store HashKey <<1 mod poly here
169 HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
170 HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
171 HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
172 HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
173 HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
174 HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
175 HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
176 HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
177 HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
178 HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
179 HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
180 HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
181 HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
182 HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
183 HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
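# Karatsuba note: with B = b1:b0 (64-bit halves), the middle term of a
# 128x128 carry-less multiply is (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0, since
# addition in GF(2) is XOR. The HashKey_i_k slots cache (b1^b0) for each
# HashKey^i, so the AVX path spends only three vpclmulqdq per block.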
191 #define keysize 2*15*16(arg1)
201 .macro define_reg r n
212 TMP1 = 16*0 # Temporary storage for AAD
213 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
214 TMP3 = 16*2 # Temporary storage for AES State 3
215 TMP4 = 16*3 # Temporary storage for AES State 4
216 TMP5 = 16*4 # Temporary storage for AES State 5
217 TMP6 = 16*5 # Temporary storage for AES State 6
218 TMP7 = 16*6 # Temporary storage for AES State 7
219 TMP8 = 16*7 # Temporary storage for AES State 8
221 VARIABLE_OFFSET = 16*8
223 ################################
225 ################################
235 sub $VARIABLE_OFFSET, %rsp
236 and $~63, %rsp # align rsp to 64 bytes
# Encryption of a single block
.macro ENCRYPT_SINGLE_BLOCK REP XMM0
vpxor (arg1), \XMM0, \XMM0
i = 1
.rep \REP
vaesenc 16*i(arg1), \XMM0, \XMM0
i = (i+1)
.endr
vaesenclast 16*i(arg1), \XMM0, \XMM0
.endm
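# REP is the number of middle AES rounds: 9 for AES-128, 11 for AES-192
# and 13 for AES-256 (one whitening XOR and one aesenclast surround them),
# matching the key_128/key_256 dispatch in the entry points below.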
261 # combined for GCM encrypt and decrypt functions
262 # clobbering all xmm registers
263 # clobbering r10, r11, r12, r13, r15, rax
264 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
265 vmovdqu AadHash(arg2), %xmm8
266 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
267 add arg5, InLen(arg2)
269 # initialize the data pointer offset as zero
272 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
275 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
276 and $-16, %r13 # r13 = r13 - (r13 mod 16)
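# the main loop processes 8 blocks per iteration, so first consume
# (number of 16-byte blocks) mod 8 blocks here; the dispatch below
# selects the matching INITIAL_BLOCKS expansion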
281 jz .L_initial_num_blocks_is_0\@
284 je .L_initial_num_blocks_is_7\@
286 je .L_initial_num_blocks_is_6\@
288 je .L_initial_num_blocks_is_5\@
290 je .L_initial_num_blocks_is_4\@
292 je .L_initial_num_blocks_is_3\@
294 je .L_initial_num_blocks_is_2\@
296 jmp .L_initial_num_blocks_is_1\@
298 .L_initial_num_blocks_is_7\@:
299 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
301 jmp .L_initial_blocks_encrypted\@
303 .L_initial_num_blocks_is_6\@:
304 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
306 jmp .L_initial_blocks_encrypted\@
308 .L_initial_num_blocks_is_5\@:
309 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
311 jmp .L_initial_blocks_encrypted\@
313 .L_initial_num_blocks_is_4\@:
314 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
316 jmp .L_initial_blocks_encrypted\@
318 .L_initial_num_blocks_is_3\@:
319 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
321 jmp .L_initial_blocks_encrypted\@
323 .L_initial_num_blocks_is_2\@:
324 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
326 jmp .L_initial_blocks_encrypted\@
328 .L_initial_num_blocks_is_1\@:
329 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
331 jmp .L_initial_blocks_encrypted\@
333 .L_initial_num_blocks_is_0\@:
334 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
337 .L_initial_blocks_encrypted\@:
339 je .L_zero_cipher_left\@
342 je .L_eight_cipher_left\@
349 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
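# two flavors of the 8-wide loop: as long as the counter's low byte
# cannot carry within the next 8 increments, ONEf is added directly to
# the byte-swapped counter (out_order) and the per-iteration vpshufb
# fixups are skipped; otherwise the counter is incremented in native
# order (in_order) with shuffles around it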
352 .L_encrypt_by_8_new\@:
359 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
362 jne .L_encrypt_by_8_new\@
364 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
365 jmp .L_eight_cipher_left\@
368 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
370 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
371 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
374 jne .L_encrypt_by_8_new\@
376 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
381 .L_eight_cipher_left\@:
382 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
385 .L_zero_cipher_left\@:
386 vmovdqu %xmm14, AadHash(arg2)
387 vmovdqu %xmm9, CurCount(arg2)
391 and $15, %r13 # r13 = (arg5 mod 16)
393 je .L_multiple_of_16_bytes\@
395 # handle the last <16 Byte block separately
397 mov %r13, PBlockLen(arg2)
399 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
400 vmovdqu %xmm9, CurCount(arg2)
401 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
403 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
404 vmovdqu %xmm9, PBlockEncKey(arg2)
407 jge .L_large_enough_update\@
409 lea (arg4,%r11,1), %r10
412 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
414 lea SHIFT_MASK+16(%rip), %r12
415 sub %r13, %r12 # adjust the shuffle mask pointer to be
416 # able to shift 16-r13 bytes (r13 is the
417 # number of bytes in plaintext mod 16)
419 jmp .L_final_ghash_mul\@
421 .L_large_enough_update\@:
# read the last <16 Byte block
426 vmovdqu (arg4, %r11, 1), %xmm1
431 lea SHIFT_MASK+16(%rip), %r12
432 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
433 # (r13 is the number of bytes in plaintext mod 16)
435 # get the appropriate shuffle mask
436 vmovdqu (%r12), %xmm2
437 # shift right 16-r13 bytes
438 vpshufb %xmm2, %xmm1, %xmm1
440 .L_final_ghash_mul\@:
443 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
444 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
445 # mask out top 16-r13 bytes of xmm9
446 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
447 vpand %xmm1, %xmm2, %xmm2
448 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
449 vpxor %xmm2, %xmm14, %xmm14
451 vmovdqu %xmm14, AadHash(arg2)
453 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
454 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
455 # mask out top 16-r13 bytes of xmm9
456 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
457 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
458 vpxor %xmm9, %xmm14, %xmm14
460 vmovdqu %xmm14, AadHash(arg2)
461 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
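# in both paths the value folded into the hash is ciphertext: for ENC it
# is the masked block just produced in xmm9, for DEC it is the masked
# copy of the input block kept in xmm2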
465 #############################
469 jle .L_less_than_8_bytes_left\@
471 mov %rax, (arg3 , %r11)
473 vpsrldq $8, %xmm9, %xmm9
477 .L_less_than_8_bytes_left\@:
478 movb %al, (arg3 , %r11)
482 jne .L_less_than_8_bytes_left\@
483 #############################
485 .L_multiple_of_16_bytes\@:
# GCM_COMPLETE finishes the tag computation over the last partial block
# Output: Authentication Tag (AUTH_TAG)
491 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
492 .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
493 vmovdqu AadHash(arg2), %xmm14
494 vmovdqu HashKey(arg2), %xmm13
496 mov PBlockLen(arg2), %r12
# GHASH computation for the last <16 Byte block
501 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
504 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
505 shl $3, %r12 # convert into number of bits
506 vmovd %r12d, %xmm15 # len(A) in xmm15
508 mov InLen(arg2), %r12
shl $3, %r12 # len(C) in bits (*8)
511 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
512 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
514 vpxor %xmm15, %xmm14, %xmm14
515 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
516 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
518 vmovdqu OrigIV(arg2), %xmm9
520 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
522 vpxor %xmm14, %xmm9, %xmm9
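# per the GCM spec the tag is E(K, Y0) XOR GHASH(A, C), truncated below
# to auth_tag_len bytes; xmm9 now holds the full 16-byte tag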
527 mov \AUTH_TAG, %r10 # r10 = authTag
528 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
541 vpsrldq $8, %xmm9, %xmm9
543 je .L_return_T_done\@
549 vpsrldq $4, %xmm9, %xmm9
551 je .L_return_T_done\@
558 je .L_return_T_done\@
563 jmp .L_return_T_done\@
566 vmovdqu %xmm9, (%r10)
571 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
573 mov \AAD, %r10 # r10 = AAD
574 mov \AADLEN, %r12 # r12 = aadLen
582 jl .L_get_AAD_rest8\@
585 vpshufb SHUF_MASK(%rip), \T7, \T7
587 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
592 jge .L_get_AAD_blocks\@
599 /* read the last <16B of AAD. since we have at least 4B of
600 data right after the AAD (the ICV, and maybe some CT), we can
601 read 4B/8B blocks safely, and then get rid of the extra stuff */
604 jle .L_get_AAD_rest4\@
611 jmp .L_get_AAD_rest8\@
614 jle .L_get_AAD_rest0\@
619 vpslldq $12, \T1, \T1
623 /* finalize: shift out the extra bytes we read, and align
624 left. since pslldq can only shift by an immediate, we use
625 vpshufb and a pair of shuffle masks */
626 leaq ALL_F(%rip), %r11
628 vmovdqu 16(%r11), \T1
630 vpshufb (%r11), \T7, \T7
632 .L_get_AAD_rest_final\@:
633 vpshufb SHUF_MASK(%rip), \T7, \T7
635 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
638 vmovdqu \T7, AadHash(arg2)
641 .macro INIT GHASH_MUL PRECOMPUTE
643 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
645 mov %r11, InLen(arg2) # ctx_data.in_length = 0
647 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
648 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
651 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
653 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
654 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
656 vmovdqu (arg4), %xmm6 # xmm6 = HashKey
658 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
659 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
661 vpsllq $1, %xmm6, %xmm6
662 vpsrlq $63, %xmm2, %xmm2
664 vpslldq $8, %xmm2, %xmm2
665 vpsrldq $8, %xmm1, %xmm1
666 vpor %xmm2, %xmm6, %xmm6
668 vpshufd $0b00100100, %xmm1, %xmm2
669 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
670 vpand POLY(%rip), %xmm2, %xmm2
671 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
672 #######################################################################
673 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
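# the sequence above computes HashKey<<1 mod poly without a branch:
# vpsllq/vpsrlq/vpor form the 128-bit left shift by one, and the
# vpshufd/vpcmpeqd/vpand sequence materializes POLY or zero depending on
# whether the shifted-out MSB was set, so the final vpxor applies the
# reduction only when it is needed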
675 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
677 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
681 # Reads DLEN bytes starting at DPTR and stores in XMMDst
682 # where 0 < DLEN < 16
683 # Clobbers %rax, DLEN
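# The reads below never touch memory past DPTR+DLEN: the low qword is
# loaded whole only when DLEN >= 8, and any remaining 1-7 bytes are
# gathered one byte at a time through %rax, then inserted with vpinsrq.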
684 .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
685 vpxor \XMMDst, \XMMDst, \XMMDst
690 vpinsrq $0, %rax, \XMMDst, \XMMDst
692 jz .L_done_read_partial_block_\@
694 .L_read_next_byte_\@:
696 mov 7(\DPTR, \DLEN, 1), %al
698 jnz .L_read_next_byte_\@
699 vpinsrq $1, %rax, \XMMDst, \XMMDst
700 jmp .L_done_read_partial_block_\@
703 .L_read_next_byte_lt8_\@:
705 mov -1(\DPTR, \DLEN, 1), %al
707 jnz .L_read_next_byte_lt8_\@
708 vpinsrq $0, %rax, \XMMDst, \XMMDst
709 .L_done_read_partial_block_\@:
# PARTIAL_BLOCK: handles encryption/decryption, and the tag contribution,
# of partial blocks carried between update calls.
714 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
716 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
717 .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
719 mov PBlockLen(arg2), %r13
721 je .L_partial_block_done_\@ # Leave Macro if no partial blocks
# Read in input data without over-reading
723 cmp $16, \PLAIN_CYPH_LEN
724 jl .L_fewer_than_16_bytes_\@
725 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
728 .L_fewer_than_16_bytes_\@:
729 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
730 mov \PLAIN_CYPH_LEN, %r12
731 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
733 mov PBlockLen(arg2), %r13
735 .L_data_read_\@: # Finished reading in data
737 vmovdqu PBlockEncKey(arg2), %xmm9
738 vmovdqu HashKey(arg2), %xmm13
740 lea SHIFT_MASK(%rip), %r12
# adjust the shuffle mask pointer to be able to shift r13 bytes
# (r13 is the length of the partial block from the previous call)
745 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
746 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
750 pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn)
752 mov \PLAIN_CYPH_LEN, %r10
754 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
756 # Determine if partial block is not being filled and
757 # shift mask accordingly
758 jge .L_no_extra_mask_1_\@
760 .L_no_extra_mask_1_\@:
762 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
763 # get the appropriate mask to mask out bottom r13 bytes of xmm9
764 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
766 vpand %xmm1, %xmm3, %xmm3
767 vmovdqa SHUF_MASK(%rip), %xmm10
768 vpshufb %xmm10, %xmm3, %xmm3
769 vpshufb %xmm2, %xmm3, %xmm3
770 vpxor %xmm3, \AAD_HASH, \AAD_HASH
773 jl .L_partial_incomplete_1_\@
775 # GHASH computation for the last <16 Byte block
776 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
779 mov %rax, PBlockLen(arg2)
781 .L_partial_incomplete_1_\@:
782 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
784 vmovdqu \AAD_HASH, AadHash(arg2)
786 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
788 mov \PLAIN_CYPH_LEN, %r10
790 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
792 # Determine if partial block is not being filled and
793 # shift mask accordingly
794 jge .L_no_extra_mask_2_\@
796 .L_no_extra_mask_2_\@:
798 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
799 # get the appropriate mask to mask out bottom r13 bytes of xmm9
800 vpand %xmm1, %xmm9, %xmm9
802 vmovdqa SHUF_MASK(%rip), %xmm1
803 vpshufb %xmm1, %xmm9, %xmm9
804 vpshufb %xmm2, %xmm9, %xmm9
805 vpxor %xmm9, \AAD_HASH, \AAD_HASH
808 jl .L_partial_incomplete_2_\@
810 # GHASH computation for the last <16 Byte block
811 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
814 mov %rax, PBlockLen(arg2)
815 jmp .L_encode_done_\@
816 .L_partial_incomplete_2_\@:
817 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
819 vmovdqu \AAD_HASH, AadHash(arg2)
821 vmovdqa SHUF_MASK(%rip), %xmm10
822 # shuffle xmm9 back to output as ciphertext
823 vpshufb %xmm10, %xmm9, %xmm9
824 vpshufb %xmm2, %xmm9, %xmm9
826 # output encrypted Bytes
828 jl .L_partial_fill_\@
831 # Set r13 to be the number of bytes to write out
835 mov \PLAIN_CYPH_LEN, %r13
840 jle .L_less_than_8_bytes_left_\@
842 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
847 .L_less_than_8_bytes_left_\@:
848 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
852 jne .L_less_than_8_bytes_left_\@
853 .L_partial_block_done_\@:
854 .endm # PARTIAL_BLOCK
856 ###############################################################################
857 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
858 # Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly (i.e. >>1)
860 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
861 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
862 ###############################################################################
863 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
865 vpshufd $0b01001110, \GH, \T2
866 vpshufd $0b01001110, \HK, \T3
867 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
868 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
870 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
871 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
872 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
874 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
876 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
877 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
879 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
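# the two phases below fold the low 128 bits of the 256-bit product back
# into the high half; with poly = x^128+x^127+x^126+x^121+1 the reflected
# reduction shows up as the per-dword shifts 31/30/25 (phase one, combined
# with the byte-granular vpslldq/vpsrldq moves) and 1/2/7 (phase two)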
881 #first phase of the reduction
vpslld $31, \GH, \T2 # packed left shift of dwords by 31
vpslld $30, \GH, \T3 # packed left shift of dwords by 30
vpslld $25, \GH, \T4 # packed left shift of dwords by 25
886 vpxor \T3, \T2, \T2 # xor the shifted versions
889 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
891 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
892 vpxor \T2, \GH, \GH # first phase of the reduction complete
894 #second phase of the reduction
vpsrld $1,\GH, \T2 # packed right shift of dwords by 1
vpsrld $2,\GH, \T3 # packed right shift of dwords by 2
vpsrld $7,\GH, \T4 # packed right shift of dwords by 7
899 vpxor \T3, \T2, \T2 # xor the shifted versions
904 vpxor \T1, \GH, \GH # the result is in GH
909 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey^i
914 vpshufd $0b01001110, \T5, \T1
916 vmovdqu \T1, HashKey_k(arg2)
918 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
919 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
920 vpshufd $0b01001110, \T5, \T1
922 vmovdqu \T1, HashKey_2_k(arg2)
924 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
925 vmovdqu \T5, HashKey_3(arg2)
926 vpshufd $0b01001110, \T5, \T1
928 vmovdqu \T1, HashKey_3_k(arg2)
930 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
931 vmovdqu \T5, HashKey_4(arg2)
932 vpshufd $0b01001110, \T5, \T1
934 vmovdqu \T1, HashKey_4_k(arg2)
936 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
937 vmovdqu \T5, HashKey_5(arg2)
938 vpshufd $0b01001110, \T5, \T1
940 vmovdqu \T1, HashKey_5_k(arg2)
942 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
943 vmovdqu \T5, HashKey_6(arg2)
944 vpshufd $0b01001110, \T5, \T1
946 vmovdqu \T1, HashKey_6_k(arg2)
948 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
949 vmovdqu \T5, HashKey_7(arg2)
950 vpshufd $0b01001110, \T5, \T1
952 vmovdqu \T1, HashKey_7_k(arg2)
954 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
955 vmovdqu \T5, HashKey_8(arg2)
956 vpshufd $0b01001110, \T5, \T1
958 vmovdqu \T1, HashKey_8_k(arg2)
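# with HashKey..HashKey^8 and their Karatsuba halves cached, the 8-wide
# loop can fold eight ciphertext blocks per pass:
# X_new = C1*HashKey^8 ^ C2*HashKey^7 ^ ... ^ C8*HashKey,
# where C1 already absorbs the previous hash value X_old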
## if a = number of total plaintext bytes and b = floor(a/16),
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
966 ## r10, r11, r12, rax are clobbered
967 ## arg1, arg2, arg3, arg4 are used as pointers only, not modified
969 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
970 i = (8-\num_initial_blocks)
972 vmovdqu AadHash(arg2), reg_i
974 # start AES for num_initial_blocks blocks
975 vmovdqu CurCount(arg2), \CTR
977 i = (9-\num_initial_blocks)
979 .rep \num_initial_blocks
980 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
982 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
987 vmovdqa (arg1), \T_key
988 i = (9-\num_initial_blocks)
990 .rep \num_initial_blocks
991 vpxor \T_key, reg_i, reg_i
999 vmovdqa 16*j(arg1), \T_key
1000 i = (9-\num_initial_blocks)
1002 .rep \num_initial_blocks
1003 vaesenc \T_key, reg_i, reg_i
1012 vmovdqa 16*j(arg1), \T_key
1013 i = (9-\num_initial_blocks)
1015 .rep \num_initial_blocks
1016 vaesenclast \T_key, reg_i, reg_i
1021 i = (9-\num_initial_blocks)
1023 .rep \num_initial_blocks
1024 vmovdqu (arg4, %r11), \T1
1025 vpxor \T1, reg_i, reg_i
1026 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
1031 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1037 i = (8-\num_initial_blocks)
1038 j = (9-\num_initial_blocks)
1041 .rep \num_initial_blocks
1042 vpxor reg_i, reg_j, reg_j
1043 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1048 # XMM8 has the combined result here
1050 vmovdqa \XMM8, TMP1(%rsp)
1054 jl .L_initial_blocks_done\@ # no need for precomputed constants
1056 ###############################################################################
# prepare eight counter blocks and begin AES on them, ahead of the main loop
1058 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1060 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1062 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1064 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1066 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1068 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1070 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1072 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1074 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1076 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1078 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1080 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1082 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1084 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1086 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1088 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1090 vmovdqa (arg1), \T_key
1091 vpxor \T_key, \XMM1, \XMM1
1092 vpxor \T_key, \XMM2, \XMM2
1093 vpxor \T_key, \XMM3, \XMM3
1094 vpxor \T_key, \XMM4, \XMM4
1095 vpxor \T_key, \XMM5, \XMM5
1096 vpxor \T_key, \XMM6, \XMM6
1097 vpxor \T_key, \XMM7, \XMM7
1098 vpxor \T_key, \XMM8, \XMM8
1102 .rep \REP # do REP rounds
1103 vmovdqa 16*i(arg1), \T_key
1104 vaesenc \T_key, \XMM1, \XMM1
1105 vaesenc \T_key, \XMM2, \XMM2
1106 vaesenc \T_key, \XMM3, \XMM3
1107 vaesenc \T_key, \XMM4, \XMM4
1108 vaesenc \T_key, \XMM5, \XMM5
1109 vaesenc \T_key, \XMM6, \XMM6
1110 vaesenc \T_key, \XMM7, \XMM7
1111 vaesenc \T_key, \XMM8, \XMM8
1116 vmovdqa 16*i(arg1), \T_key
1117 vaesenclast \T_key, \XMM1, \XMM1
1118 vaesenclast \T_key, \XMM2, \XMM2
1119 vaesenclast \T_key, \XMM3, \XMM3
1120 vaesenclast \T_key, \XMM4, \XMM4
1121 vaesenclast \T_key, \XMM5, \XMM5
1122 vaesenclast \T_key, \XMM6, \XMM6
1123 vaesenclast \T_key, \XMM7, \XMM7
1124 vaesenclast \T_key, \XMM8, \XMM8
1126 vmovdqu (arg4, %r11), \T1
1127 vpxor \T1, \XMM1, \XMM1
1128 vmovdqu \XMM1, (arg3 , %r11)
1133 vmovdqu 16*1(arg4, %r11), \T1
1134 vpxor \T1, \XMM2, \XMM2
1135 vmovdqu \XMM2, 16*1(arg3 , %r11)
1140 vmovdqu 16*2(arg4, %r11), \T1
1141 vpxor \T1, \XMM3, \XMM3
1142 vmovdqu \XMM3, 16*2(arg3 , %r11)
1147 vmovdqu 16*3(arg4, %r11), \T1
1148 vpxor \T1, \XMM4, \XMM4
1149 vmovdqu \XMM4, 16*3(arg3 , %r11)
1154 vmovdqu 16*4(arg4, %r11), \T1
1155 vpxor \T1, \XMM5, \XMM5
1156 vmovdqu \XMM5, 16*4(arg3 , %r11)
1161 vmovdqu 16*5(arg4, %r11), \T1
1162 vpxor \T1, \XMM6, \XMM6
1163 vmovdqu \XMM6, 16*5(arg3 , %r11)
1168 vmovdqu 16*6(arg4, %r11), \T1
1169 vpxor \T1, \XMM7, \XMM7
1170 vmovdqu \XMM7, 16*6(arg3 , %r11)
1175 vmovdqu 16*7(arg4, %r11), \T1
1176 vpxor \T1, \XMM8, \XMM8
1177 vmovdqu \XMM8, 16*7(arg3 , %r11)
1184 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1185 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1186 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1187 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1188 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1189 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1190 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1191 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1192 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1194 ###############################################################################
1196 .L_initial_blocks_done\@:
1200 # encrypt 8 blocks at a time
1201 # ghash the 8 previously encrypted ciphertext blocks
1202 # arg1, arg2, arg3, arg4 are used as pointers only, not modified
1203 # r11 is the data offset value
1204 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
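# the GHASH folds of the previous eight ciphertext blocks are interleaved
# below with the AES rounds of the next eight counter blocks, so the
# latencies of the two dependency chains overlap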
1207 vmovdqa \XMM2, TMP2(%rsp)
1208 vmovdqa \XMM3, TMP3(%rsp)
1209 vmovdqa \XMM4, TMP4(%rsp)
1210 vmovdqa \XMM5, TMP5(%rsp)
1211 vmovdqa \XMM6, TMP6(%rsp)
1212 vmovdqa \XMM7, TMP7(%rsp)
1213 vmovdqa \XMM8, TMP8(%rsp)
1215 .if \loop_idx == in_order
1216 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1217 vpaddd ONE(%rip), \XMM1, \XMM2
1218 vpaddd ONE(%rip), \XMM2, \XMM3
1219 vpaddd ONE(%rip), \XMM3, \XMM4
1220 vpaddd ONE(%rip), \XMM4, \XMM5
1221 vpaddd ONE(%rip), \XMM5, \XMM6
1222 vpaddd ONE(%rip), \XMM6, \XMM7
1223 vpaddd ONE(%rip), \XMM7, \XMM8
1226 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1227 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1228 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1229 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1230 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1231 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1232 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1233 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1235 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1236 vpaddd ONEf(%rip), \XMM1, \XMM2
1237 vpaddd ONEf(%rip), \XMM2, \XMM3
1238 vpaddd ONEf(%rip), \XMM3, \XMM4
1239 vpaddd ONEf(%rip), \XMM4, \XMM5
1240 vpaddd ONEf(%rip), \XMM5, \XMM6
1241 vpaddd ONEf(%rip), \XMM6, \XMM7
1242 vpaddd ONEf(%rip), \XMM7, \XMM8
1247 #######################################################################
1250 vpxor \T1, \XMM1, \XMM1
1251 vpxor \T1, \XMM2, \XMM2
1252 vpxor \T1, \XMM3, \XMM3
1253 vpxor \T1, \XMM4, \XMM4
1254 vpxor \T1, \XMM5, \XMM5
1255 vpxor \T1, \XMM6, \XMM6
1256 vpxor \T1, \XMM7, \XMM7
1257 vpxor \T1, \XMM8, \XMM8
1259 #######################################################################
1265 vmovdqu 16*1(arg1), \T1
1266 vaesenc \T1, \XMM1, \XMM1
1267 vaesenc \T1, \XMM2, \XMM2
1268 vaesenc \T1, \XMM3, \XMM3
1269 vaesenc \T1, \XMM4, \XMM4
1270 vaesenc \T1, \XMM5, \XMM5
1271 vaesenc \T1, \XMM6, \XMM6
1272 vaesenc \T1, \XMM7, \XMM7
1273 vaesenc \T1, \XMM8, \XMM8
1275 vmovdqu 16*2(arg1), \T1
1276 vaesenc \T1, \XMM1, \XMM1
1277 vaesenc \T1, \XMM2, \XMM2
1278 vaesenc \T1, \XMM3, \XMM3
1279 vaesenc \T1, \XMM4, \XMM4
1280 vaesenc \T1, \XMM5, \XMM5
1281 vaesenc \T1, \XMM6, \XMM6
1282 vaesenc \T1, \XMM7, \XMM7
1283 vaesenc \T1, \XMM8, \XMM8
1286 #######################################################################
1288 vmovdqu HashKey_8(arg2), \T5
1289 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1290 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1292 vpshufd $0b01001110, \T2, \T6
1295 vmovdqu HashKey_8_k(arg2), \T5
1296 vpclmulqdq $0x00, \T5, \T6, \T6
1298 vmovdqu 16*3(arg1), \T1
1299 vaesenc \T1, \XMM1, \XMM1
1300 vaesenc \T1, \XMM2, \XMM2
1301 vaesenc \T1, \XMM3, \XMM3
1302 vaesenc \T1, \XMM4, \XMM4
1303 vaesenc \T1, \XMM5, \XMM5
1304 vaesenc \T1, \XMM6, \XMM6
1305 vaesenc \T1, \XMM7, \XMM7
1306 vaesenc \T1, \XMM8, \XMM8
1308 vmovdqa TMP2(%rsp), \T1
1309 vmovdqu HashKey_7(arg2), \T5
1310 vpclmulqdq $0x11, \T5, \T1, \T3
1312 vpclmulqdq $0x00, \T5, \T1, \T3
1315 vpshufd $0b01001110, \T1, \T3
1317 vmovdqu HashKey_7_k(arg2), \T5
1318 vpclmulqdq $0x10, \T5, \T3, \T3
1321 vmovdqu 16*4(arg1), \T1
1322 vaesenc \T1, \XMM1, \XMM1
1323 vaesenc \T1, \XMM2, \XMM2
1324 vaesenc \T1, \XMM3, \XMM3
1325 vaesenc \T1, \XMM4, \XMM4
1326 vaesenc \T1, \XMM5, \XMM5
1327 vaesenc \T1, \XMM6, \XMM6
1328 vaesenc \T1, \XMM7, \XMM7
1329 vaesenc \T1, \XMM8, \XMM8
1331 #######################################################################
1333 vmovdqa TMP3(%rsp), \T1
1334 vmovdqu HashKey_6(arg2), \T5
1335 vpclmulqdq $0x11, \T5, \T1, \T3
1337 vpclmulqdq $0x00, \T5, \T1, \T3
1340 vpshufd $0b01001110, \T1, \T3
1342 vmovdqu HashKey_6_k(arg2), \T5
1343 vpclmulqdq $0x10, \T5, \T3, \T3
1346 vmovdqu 16*5(arg1), \T1
1347 vaesenc \T1, \XMM1, \XMM1
1348 vaesenc \T1, \XMM2, \XMM2
1349 vaesenc \T1, \XMM3, \XMM3
1350 vaesenc \T1, \XMM4, \XMM4
1351 vaesenc \T1, \XMM5, \XMM5
1352 vaesenc \T1, \XMM6, \XMM6
1353 vaesenc \T1, \XMM7, \XMM7
1354 vaesenc \T1, \XMM8, \XMM8
1356 vmovdqa TMP4(%rsp), \T1
1357 vmovdqu HashKey_5(arg2), \T5
1358 vpclmulqdq $0x11, \T5, \T1, \T3
1360 vpclmulqdq $0x00, \T5, \T1, \T3
1363 vpshufd $0b01001110, \T1, \T3
1365 vmovdqu HashKey_5_k(arg2), \T5
1366 vpclmulqdq $0x10, \T5, \T3, \T3
1369 vmovdqu 16*6(arg1), \T1
1370 vaesenc \T1, \XMM1, \XMM1
1371 vaesenc \T1, \XMM2, \XMM2
1372 vaesenc \T1, \XMM3, \XMM3
1373 vaesenc \T1, \XMM4, \XMM4
1374 vaesenc \T1, \XMM5, \XMM5
1375 vaesenc \T1, \XMM6, \XMM6
1376 vaesenc \T1, \XMM7, \XMM7
1377 vaesenc \T1, \XMM8, \XMM8
1380 vmovdqa TMP5(%rsp), \T1
1381 vmovdqu HashKey_4(arg2), \T5
1382 vpclmulqdq $0x11, \T5, \T1, \T3
1384 vpclmulqdq $0x00, \T5, \T1, \T3
1387 vpshufd $0b01001110, \T1, \T3
1389 vmovdqu HashKey_4_k(arg2), \T5
1390 vpclmulqdq $0x10, \T5, \T3, \T3
1393 vmovdqu 16*7(arg1), \T1
1394 vaesenc \T1, \XMM1, \XMM1
1395 vaesenc \T1, \XMM2, \XMM2
1396 vaesenc \T1, \XMM3, \XMM3
1397 vaesenc \T1, \XMM4, \XMM4
1398 vaesenc \T1, \XMM5, \XMM5
1399 vaesenc \T1, \XMM6, \XMM6
1400 vaesenc \T1, \XMM7, \XMM7
1401 vaesenc \T1, \XMM8, \XMM8
1403 vmovdqa TMP6(%rsp), \T1
1404 vmovdqu HashKey_3(arg2), \T5
1405 vpclmulqdq $0x11, \T5, \T1, \T3
1407 vpclmulqdq $0x00, \T5, \T1, \T3
1410 vpshufd $0b01001110, \T1, \T3
1412 vmovdqu HashKey_3_k(arg2), \T5
1413 vpclmulqdq $0x10, \T5, \T3, \T3
1417 vmovdqu 16*8(arg1), \T1
1418 vaesenc \T1, \XMM1, \XMM1
1419 vaesenc \T1, \XMM2, \XMM2
1420 vaesenc \T1, \XMM3, \XMM3
1421 vaesenc \T1, \XMM4, \XMM4
1422 vaesenc \T1, \XMM5, \XMM5
1423 vaesenc \T1, \XMM6, \XMM6
1424 vaesenc \T1, \XMM7, \XMM7
1425 vaesenc \T1, \XMM8, \XMM8
1427 vmovdqa TMP7(%rsp), \T1
1428 vmovdqu HashKey_2(arg2), \T5
1429 vpclmulqdq $0x11, \T5, \T1, \T3
1431 vpclmulqdq $0x00, \T5, \T1, \T3
1434 vpshufd $0b01001110, \T1, \T3
1436 vmovdqu HashKey_2_k(arg2), \T5
1437 vpclmulqdq $0x10, \T5, \T3, \T3
1440 #######################################################################
1442 vmovdqu 16*9(arg1), \T5
1443 vaesenc \T5, \XMM1, \XMM1
1444 vaesenc \T5, \XMM2, \XMM2
1445 vaesenc \T5, \XMM3, \XMM3
1446 vaesenc \T5, \XMM4, \XMM4
1447 vaesenc \T5, \XMM5, \XMM5
1448 vaesenc \T5, \XMM6, \XMM6
1449 vaesenc \T5, \XMM7, \XMM7
1450 vaesenc \T5, \XMM8, \XMM8
1452 vmovdqa TMP8(%rsp), \T1
1453 vmovdqu HashKey(arg2), \T5
1454 vpclmulqdq $0x11, \T5, \T1, \T3
1456 vpclmulqdq $0x00, \T5, \T1, \T3
1459 vpshufd $0b01001110, \T1, \T3
1461 vmovdqu HashKey_k(arg2), \T5
1462 vpclmulqdq $0x10, \T5, \T3, \T3
1468 vmovdqu 16*10(arg1), \T5
1474 vaesenc \T5, \XMM1, \XMM1
1475 vaesenc \T5, \XMM2, \XMM2
1476 vaesenc \T5, \XMM3, \XMM3
1477 vaesenc \T5, \XMM4, \XMM4
1478 vaesenc \T5, \XMM5, \XMM5
1479 vaesenc \T5, \XMM6, \XMM6
1480 vaesenc \T5, \XMM7, \XMM7
1481 vaesenc \T5, \XMM8, \XMM8
1483 vmovdqu 16*i(arg1), \T5
1492 vpxor 16*i(arg4, %r11), \T5, \T2
1494 vaesenclast \T2, reg_j, reg_j
1496 vaesenclast \T2, reg_j, \T3
1497 vmovdqu 16*i(arg4, %r11), reg_j
1498 vmovdqu \T3, 16*i(arg3, %r11)
1504 #######################################################################
1507 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
1510 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1514 #######################################################################
1515 #first phase of the reduction
1516 #######################################################################
vpslld $31, \T7, \T2 # packed left shift of dwords by 31
vpslld $30, \T7, \T3 # packed left shift of dwords by 30
vpslld $25, \T7, \T4 # packed left shift of dwords by 25
1521 vpxor \T3, \T2, \T2 # xor the shifted versions
1524 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1526 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1527 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1528 #######################################################################
1530 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1531 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1532 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1533 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1534 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1535 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1536 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1537 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1540 #######################################################################
1541 #second phase of the reduction
vpsrld $1, \T7, \T2 # packed right shift of dwords by 1
vpsrld $2, \T7, \T3 # packed right shift of dwords by 2
vpsrld $7, \T7, \T4 # packed right shift of dwords by 7
1545 vpxor \T3, \T2, \T2 # xor the shifted versions
1550 vpxor \T7, \T6, \T6 # the result is in T6
1551 #######################################################################
1553 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1554 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1555 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1556 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1557 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1558 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1559 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1560 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1563 vpxor \T6, \XMM1, \XMM1
# GHASH the last 8 ciphertext blocks.
1571 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1576 vpshufd $0b01001110, \XMM1, \T2
1577 vpxor \XMM1, \T2, \T2
1578 vmovdqu HashKey_8(arg2), \T5
1579 vpclmulqdq $0x11, \T5, \XMM1, \T6
1580 vpclmulqdq $0x00, \T5, \XMM1, \T7
1582 vmovdqu HashKey_8_k(arg2), \T3
1583 vpclmulqdq $0x00, \T3, \T2, \XMM1
1585 ######################
1587 vpshufd $0b01001110, \XMM2, \T2
1588 vpxor \XMM2, \T2, \T2
1589 vmovdqu HashKey_7(arg2), \T5
1590 vpclmulqdq $0x11, \T5, \XMM2, \T4
1593 vpclmulqdq $0x00, \T5, \XMM2, \T4
1596 vmovdqu HashKey_7_k(arg2), \T3
1597 vpclmulqdq $0x00, \T3, \T2, \T2
1598 vpxor \T2, \XMM1, \XMM1
1600 ######################
1602 vpshufd $0b01001110, \XMM3, \T2
1603 vpxor \XMM3, \T2, \T2
1604 vmovdqu HashKey_6(arg2), \T5
1605 vpclmulqdq $0x11, \T5, \XMM3, \T4
1608 vpclmulqdq $0x00, \T5, \XMM3, \T4
1611 vmovdqu HashKey_6_k(arg2), \T3
1612 vpclmulqdq $0x00, \T3, \T2, \T2
1613 vpxor \T2, \XMM1, \XMM1
1615 ######################
1617 vpshufd $0b01001110, \XMM4, \T2
1618 vpxor \XMM4, \T2, \T2
1619 vmovdqu HashKey_5(arg2), \T5
1620 vpclmulqdq $0x11, \T5, \XMM4, \T4
1623 vpclmulqdq $0x00, \T5, \XMM4, \T4
1626 vmovdqu HashKey_5_k(arg2), \T3
1627 vpclmulqdq $0x00, \T3, \T2, \T2
1628 vpxor \T2, \XMM1, \XMM1
1630 ######################
1632 vpshufd $0b01001110, \XMM5, \T2
1633 vpxor \XMM5, \T2, \T2
1634 vmovdqu HashKey_4(arg2), \T5
1635 vpclmulqdq $0x11, \T5, \XMM5, \T4
1638 vpclmulqdq $0x00, \T5, \XMM5, \T4
1641 vmovdqu HashKey_4_k(arg2), \T3
1642 vpclmulqdq $0x00, \T3, \T2, \T2
1643 vpxor \T2, \XMM1, \XMM1
1645 ######################
1647 vpshufd $0b01001110, \XMM6, \T2
1648 vpxor \XMM6, \T2, \T2
1649 vmovdqu HashKey_3(arg2), \T5
1650 vpclmulqdq $0x11, \T5, \XMM6, \T4
1653 vpclmulqdq $0x00, \T5, \XMM6, \T4
1656 vmovdqu HashKey_3_k(arg2), \T3
1657 vpclmulqdq $0x00, \T3, \T2, \T2
1658 vpxor \T2, \XMM1, \XMM1
1660 ######################
1662 vpshufd $0b01001110, \XMM7, \T2
1663 vpxor \XMM7, \T2, \T2
1664 vmovdqu HashKey_2(arg2), \T5
1665 vpclmulqdq $0x11, \T5, \XMM7, \T4
1668 vpclmulqdq $0x00, \T5, \XMM7, \T4
1671 vmovdqu HashKey_2_k(arg2), \T3
1672 vpclmulqdq $0x00, \T3, \T2, \T2
1673 vpxor \T2, \XMM1, \XMM1
1675 ######################
1677 vpshufd $0b01001110, \XMM8, \T2
1678 vpxor \XMM8, \T2, \T2
1679 vmovdqu HashKey(arg2), \T5
1680 vpclmulqdq $0x11, \T5, \XMM8, \T4
1683 vpclmulqdq $0x00, \T5, \XMM8, \T4
1686 vmovdqu HashKey_k(arg2), \T3
1687 vpclmulqdq $0x00, \T3, \T2, \T2
1689 vpxor \T2, \XMM1, \XMM1
1690 vpxor \T6, \XMM1, \XMM1
1691 vpxor \T7, \XMM1, \T2
1696 vpslldq $8, \T2, \T4
1697 vpsrldq $8, \T2, \T2
1700 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1701 # the accumulated carry-less multiplications
1703 #######################################################################
1704 #first phase of the reduction
vpslld $31, \T7, \T2 # packed left shift of dwords by 31
vpslld $30, \T7, \T3 # packed left shift of dwords by 30
vpslld $25, \T7, \T4 # packed left shift of dwords by 25
1709 vpxor \T3, \T2, \T2 # xor the shifted versions
1712 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1714 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1715 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1716 #######################################################################
1719 #second phase of the reduction
vpsrld $1, \T7, \T2 # packed right shift of dwords by 1
vpsrld $2, \T7, \T3 # packed right shift of dwords by 2
vpsrld $7, \T7, \T4 # packed right shift of dwords by 7
1723 vpxor \T3, \T2, \T2 # xor the shifted versions
1728 vpxor \T7, \T6, \T6 # the result is in T6
1732 #############################################################
#void aesni_gcm_init_avx_gen2
# (gcm_data *my_ctx_data,
# gcm_context_data *data,
# u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1737 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1738 # (from Security Association) concatenated with 8 byte
1739 # Initialisation Vector (from IPSec ESP Payload)
1740 # concatenated with 0x00000001. 16-byte aligned pointer. */
1741 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1742 # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1743 #############################################################
1744 SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1746 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1749 SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1751 ###############################################################################
1752 #void aesni_gcm_enc_update_avx_gen2(
1753 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1754 # gcm_context_data *data,
1755 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1756 # const u8 *in, /* Plaintext input */
1757 # u64 plaintext_len) /* Length of data in Bytes for encryption. */
1758 ###############################################################################
1759 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1763 je key_256_enc_update
1765 je key_128_enc_update
1767 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1771 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1775 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1778 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1780 ###############################################################################
1781 #void aesni_gcm_dec_update_avx_gen2(
1782 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1783 # gcm_context_data *data,
1784 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1785 # const u8 *in, /* Ciphertext input */
# u64 plaintext_len) /* Length of data in Bytes for decryption. */
1787 ###############################################################################
1788 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1792 je key_256_dec_update
1794 je key_128_dec_update
1796 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1800 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1804 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1807 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1809 ###############################################################################
1810 #void aesni_gcm_finalize_avx_gen2(
1811 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1812 # gcm_context_data *data,
1813 # u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len) /* Authenticated Tag Length in bytes.
1815 # Valid values are 16 (most likely), 12 or 8. */
1816 ###############################################################################
1817 SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1825 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1829 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1833 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1836 SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
1838 ###############################################################################
1839 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1840 # Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly (i.e. >>1)
1842 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1843 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1844 ###############################################################################
1845 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1847 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1848 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1849 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1850 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1854 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1855 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1860 #######################################################################
1861 #first phase of the reduction
1862 vmovdqa POLY2(%rip), \T3
1864 vpclmulqdq $0x01, \GH, \T3, \T2
1865 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1867 vpxor \T2, \GH, \GH # first phase of the reduction complete
1868 #######################################################################
1869 #second phase of the reduction
1870 vpclmulqdq $0x00, \GH, \T3, \T2
1871 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1873 vpclmulqdq $0x10, \GH, \T3, \GH
1874 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1876 vpxor \T2, \GH, \GH # second phase of the reduction complete
1877 #######################################################################
1878 vpxor \T1, \GH, \GH # the result is in GH
1883 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
# precompute HashKey^2 through HashKey^8; no Karatsuba halves are stored here
1887 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1888 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1890 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1891 vmovdqu \T5, HashKey_3(arg2)
1893 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1894 vmovdqu \T5, HashKey_4(arg2)
1896 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1897 vmovdqu \T5, HashKey_5(arg2)
1899 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1900 vmovdqu \T5, HashKey_6(arg2)
1902 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1903 vmovdqu \T5, HashKey_7(arg2)
1905 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1906 vmovdqu \T5, HashKey_8(arg2)
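# note: no HashKey_i_k values are stored; GHASH_MUL_AVX2 computes all
# four 64x64 products (a1*b1, a0*b0, a1*b0, a0*b1) directly instead of
# using Karatsuba, so only the key powers themselves are needed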
## if a = number of total plaintext bytes and b = floor(a/16),
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
1914 ## r10, r11, r12, rax are clobbered
1915 ## arg1, arg2, arg3, arg4 are used as pointers only, not modified
1917 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1918 i = (8-\num_initial_blocks)
1920 vmovdqu AadHash(arg2), reg_i
1922 # start AES for num_initial_blocks blocks
1923 vmovdqu CurCount(arg2), \CTR
1925 i = (9-\num_initial_blocks)
1927 .rep \num_initial_blocks
1928 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1930 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1935 vmovdqa (arg1), \T_key
1936 i = (9-\num_initial_blocks)
1938 .rep \num_initial_blocks
1939 vpxor \T_key, reg_i, reg_i
1947 vmovdqa 16*j(arg1), \T_key
1948 i = (9-\num_initial_blocks)
1950 .rep \num_initial_blocks
1951 vaesenc \T_key, reg_i, reg_i
1961 vmovdqa 16*j(arg1), \T_key
1962 i = (9-\num_initial_blocks)
1964 .rep \num_initial_blocks
1965 vaesenclast \T_key, reg_i, reg_i
1970 i = (9-\num_initial_blocks)
1972 .rep \num_initial_blocks
1973 vmovdqu (arg4, %r11), \T1
1974 vpxor \T1, reg_i, reg_i
1975 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
1976 # num_initial_blocks blocks
1981 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1987 i = (8-\num_initial_blocks)
1988 j = (9-\num_initial_blocks)
1991 .rep \num_initial_blocks
1992 vpxor reg_i, reg_j, reg_j
1993 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1998 # XMM8 has the combined result here
2000 vmovdqa \XMM8, TMP1(%rsp)
2004 jl .L_initial_blocks_done\@ # no need for precomputed constants
2006 ###############################################################################
# prepare eight counter blocks and begin AES on them, ahead of the main loop
2008 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2010 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2012 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2014 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2016 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2018 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2020 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2022 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2024 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2026 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2028 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2030 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2032 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2034 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2036 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2038 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2040 vmovdqa (arg1), \T_key
2041 vpxor \T_key, \XMM1, \XMM1
2042 vpxor \T_key, \XMM2, \XMM2
2043 vpxor \T_key, \XMM3, \XMM3
2044 vpxor \T_key, \XMM4, \XMM4
2045 vpxor \T_key, \XMM5, \XMM5
2046 vpxor \T_key, \XMM6, \XMM6
2047 vpxor \T_key, \XMM7, \XMM7
2048 vpxor \T_key, \XMM8, \XMM8
2052 .rep \REP # do REP rounds
2053 vmovdqa 16*i(arg1), \T_key
2054 vaesenc \T_key, \XMM1, \XMM1
2055 vaesenc \T_key, \XMM2, \XMM2
2056 vaesenc \T_key, \XMM3, \XMM3
2057 vaesenc \T_key, \XMM4, \XMM4
2058 vaesenc \T_key, \XMM5, \XMM5
2059 vaesenc \T_key, \XMM6, \XMM6
2060 vaesenc \T_key, \XMM7, \XMM7
2061 vaesenc \T_key, \XMM8, \XMM8
2067 vmovdqa 16*i(arg1), \T_key
2068 vaesenclast \T_key, \XMM1, \XMM1
2069 vaesenclast \T_key, \XMM2, \XMM2
2070 vaesenclast \T_key, \XMM3, \XMM3
2071 vaesenclast \T_key, \XMM4, \XMM4
2072 vaesenclast \T_key, \XMM5, \XMM5
2073 vaesenclast \T_key, \XMM6, \XMM6
2074 vaesenclast \T_key, \XMM7, \XMM7
2075 vaesenclast \T_key, \XMM8, \XMM8
2077 vmovdqu (arg4, %r11), \T1
2078 vpxor \T1, \XMM1, \XMM1
2079 vmovdqu \XMM1, (arg3 , %r11)
2084 vmovdqu 16*1(arg4, %r11), \T1
2085 vpxor \T1, \XMM2, \XMM2
2086 vmovdqu \XMM2, 16*1(arg3 , %r11)
2091 vmovdqu 16*2(arg4, %r11), \T1
2092 vpxor \T1, \XMM3, \XMM3
2093 vmovdqu \XMM3, 16*2(arg3 , %r11)
2098 vmovdqu 16*3(arg4, %r11), \T1
2099 vpxor \T1, \XMM4, \XMM4
2100 vmovdqu \XMM4, 16*3(arg3 , %r11)
2105 vmovdqu 16*4(arg4, %r11), \T1
2106 vpxor \T1, \XMM5, \XMM5
2107 vmovdqu \XMM5, 16*4(arg3 , %r11)
2112 vmovdqu 16*5(arg4, %r11), \T1
2113 vpxor \T1, \XMM6, \XMM6
2114 vmovdqu \XMM6, 16*5(arg3 , %r11)
2119 vmovdqu 16*6(arg4, %r11), \T1
2120 vpxor \T1, \XMM7, \XMM7
2121 vmovdqu \XMM7, 16*6(arg3 , %r11)
2126 vmovdqu 16*7(arg4, %r11), \T1
2127 vpxor \T1, \XMM8, \XMM8
2128 vmovdqu \XMM8, 16*7(arg3 , %r11)
2135 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2136 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
2137 # the corresponding ciphertext
2138 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2139 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2140 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2141 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2142 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2143 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2144 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2146 ###############################################################################
2148 .L_initial_blocks_done\@:
2155 # encrypt 8 blocks at a time
2156 # ghash the 8 previously encrypted ciphertext blocks
2157 # arg1, arg2, arg3, arg4 are used as pointers only, not modified
2158 # r11 is the data offset value
2159 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2162 vmovdqa \XMM2, TMP2(%rsp)
2163 vmovdqa \XMM3, TMP3(%rsp)
2164 vmovdqa \XMM4, TMP4(%rsp)
2165 vmovdqa \XMM5, TMP5(%rsp)
2166 vmovdqa \XMM6, TMP6(%rsp)
2167 vmovdqa \XMM7, TMP7(%rsp)
2168 vmovdqa \XMM8, TMP8(%rsp)
2170 .if \loop_idx == in_order
2171 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2172 vpaddd ONE(%rip), \XMM1, \XMM2
2173 vpaddd ONE(%rip), \XMM2, \XMM3
2174 vpaddd ONE(%rip), \XMM3, \XMM4
2175 vpaddd ONE(%rip), \XMM4, \XMM5
2176 vpaddd ONE(%rip), \XMM5, \XMM6
2177 vpaddd ONE(%rip), \XMM6, \XMM7
2178 vpaddd ONE(%rip), \XMM7, \XMM8
2181 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2182 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2183 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2184 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2185 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2186 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2187 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2188 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2190 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2191 vpaddd ONEf(%rip), \XMM1, \XMM2
2192 vpaddd ONEf(%rip), \XMM2, \XMM3
2193 vpaddd ONEf(%rip), \XMM3, \XMM4
2194 vpaddd ONEf(%rip), \XMM4, \XMM5
2195 vpaddd ONEf(%rip), \XMM5, \XMM6
2196 vpaddd ONEf(%rip), \XMM6, \XMM7
2197 vpaddd ONEf(%rip), \XMM7, \XMM8
2202 #######################################################################
2205 vpxor \T1, \XMM1, \XMM1
2206 vpxor \T1, \XMM2, \XMM2
2207 vpxor \T1, \XMM3, \XMM3
2208 vpxor \T1, \XMM4, \XMM4
2209 vpxor \T1, \XMM5, \XMM5
2210 vpxor \T1, \XMM6, \XMM6
2211 vpxor \T1, \XMM7, \XMM7
2212 vpxor \T1, \XMM8, \XMM8
        #######################################################################

        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8
        #######################################################################

        vmovdqu         HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4    # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7    # T7 = a0*b0
        vpclmulqdq      $0x01, \T5, \T2, \T6    # T6 = a1*b0
        vpclmulqdq      $0x10, \T5, \T2, \T5    # T5 = a0*b1
        vpxor           \T5, \T6, \T6           # T6 = a1*b0 ^ a0*b1 (middle products)
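        # 128x128-bit carry-less multiply, schoolbook form: with a = a1:a0
        # and b = b1:b0 (64-bit halves),
        #   a * b = (a1*b1 << 128) ^ ((a1*b0 ^ a0*b1) << 64) ^ (a0*b0)
        # T4/T7 collect the high/low products and T6 the middle products
        # across all eight blocks; the shared modular reduction is deferred
        # to the end of the macro.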
        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8
        vmovdqa         TMP2(%rsp), \T1
        vmovdqu         HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4           # accumulate a1*b1

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a1*b0

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a0*b1
        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8
        #######################################################################

        vmovdqa         TMP3(%rsp), \T1
        vmovdqu         HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4           # accumulate a1*b1

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a1*b0

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a0*b1
        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8
        vmovdqa         TMP4(%rsp), \T1
        vmovdqu         HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4           # accumulate a1*b1

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a1*b0

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a0*b1
        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8
        vmovdqa         TMP5(%rsp), \T1
        vmovdqu         HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4           # accumulate a1*b1

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a1*b0

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a0*b1
        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8
        vmovdqa         TMP6(%rsp), \T1
        vmovdqu         HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4           # accumulate a1*b1

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a1*b0

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a0*b1
        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8
        vmovdqa         TMP7(%rsp), \T1
        vmovdqu         HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4           # accumulate a1*b1

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a1*b0

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a0*b1
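        # Aggregated GHASH: the eight stashed blocks X1..X8 are multiplied by
        # the precomputed powers HashKey_8..HashKey (H^8..H^1), so
        #   Y = X1*H^8 ^ X2*H^7 ^ ... ^ X8*H
        # needs only one modular reduction for all eight blocks.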
        #######################################################################

        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8
        vmovdqa         TMP8(%rsp), \T1
        vmovdqu         HashKey(arg2), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a1*b0

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6           # accumulate a0*b1

        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T1           # T1 = accumulated a1*b1
        vmovdqu 16*10(arg1), \T5
        i = 11
        setreg
.rep (\REP-9)                                   # extra vaesenc rounds for AES-192/256
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8
        vmovdqu 16*i(arg1), \T5                 # next round key (the last one on exit)
        i = (i+1)
        setreg
.endr
        i = 0
        j = 1
        setreg
.rep 8                                          # final round, one block at a time
        vpxor   16*i(arg4, %r11), \T5, \T2
        .if \ENC_DEC == ENC
        vaesenclast     \T2, reg_j, reg_j
        .else
        vaesenclast     \T2, reg_j, \T3
        vmovdqu 16*i(arg4, %r11), reg_j         # reload the ciphertext block for GHASH
        vmovdqu \T3, 16*i(arg3, %r11)           # write the plaintext block
        .endif
        i = (i+1)
        j = (j+1)
        setreg
.endr
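        # The input block is XORed into the last round key up front, so a
        # single vaesenclast yields keystream ^ input, i.e. the output block.
        # For decryption, the block registers are then reloaded with the
        # input ciphertext, because GHASH is always computed over the
        # ciphertext in both directions.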
        #######################################################################

        vpslldq $8, \T6, \T3                    # shift-L T6 2 DWs (into T3)
        vpsrldq $8, \T6, \T6                    # shift-R T6 2 DWs
        vpxor   \T3, \T7, \T7                   # fold the low half of the middle term into T7
        vpxor   \T6, \T1, \T1                   # accumulate the results in T1:T7
        #######################################################################
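        # Reduce the 256-bit product in <T1:T7> modulo the GHASH polynomial
        # g(x) = x^128 + x^7 + x^2 + x + 1. Because the data is kept in the
        # bit-reflected domain, the low half T7 is folded upward in two
        # carry-less multiplies by POLY2 (a pre-shifted form of g), and the
        # folded value is finally XORed into the high half T1. No carries
        # propagate, since all arithmetic is in GF(2).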
        #first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################
        .if \ENC_DEC == ENC
        vmovdqu \XMM1, 16*0(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg3,%r11)          # Write to the Ciphertext buffer
        .endif                                  # (for DEC the output was written above)
        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor   \T4, \T1, \T1                   # the result is in T1
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

        vpxor   \T1, \XMM1, \XMM1               # combine current GHASH value with the first ciphertext block

.endm
# GHASH the last 8 ciphertext blocks.
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
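        ## Karatsuba Method: each 128x128 carry-less multiply below uses
        ## three vpclmulqdq instead of four. With a = a1:a0 and b = b1:b0,
        ##   a1*b0 ^ a0*b1 = (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0
        ## so only a1*b1, a0*b0 and (a1^a0)*(b1^b0) are computed per block;
        ## the vpshufd/vpxor pairs build the (a1^a0) and (b1^b0) operands.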
        vmovdqu         HashKey_8(arg2), \T5

        vpshufd         $0b01001110, \XMM1, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM1, \T2, \T2         # T2 = a1^a0
        vpxor           \T5, \T3, \T3           # T3 = b1^b0

        vpclmulqdq      $0x11, \T5, \XMM1, \T6  # T6 = a1*b1
        vpclmulqdq      $0x00, \T5, \XMM1, \T7  # T7 = a0*b0

        vpclmulqdq      $0x00, \T3, \T2, \XMM1  # XMM1 = (a1^a0)*(b1^b0)

        ######################
        vmovdqu         HashKey_7(arg2), \T5
        vpshufd         $0b01001110, \XMM2, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM2, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor           \T4, \T6, \T6           # accumulate a1*b1

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor           \T4, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1       # accumulate (a1^a0)*(b1^b0)

        ######################
        vmovdqu         HashKey_6(arg2), \T5
        vpshufd         $0b01001110, \XMM3, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM3, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor           \T4, \T6, \T6           # accumulate a1*b1

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor           \T4, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1       # accumulate (a1^a0)*(b1^b0)

        ######################
        vmovdqu         HashKey_5(arg2), \T5
        vpshufd         $0b01001110, \XMM4, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM4, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor           \T4, \T6, \T6           # accumulate a1*b1

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor           \T4, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1       # accumulate (a1^a0)*(b1^b0)

        ######################
        vmovdqu         HashKey_4(arg2), \T5
        vpshufd         $0b01001110, \XMM5, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM5, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor           \T4, \T6, \T6           # accumulate a1*b1

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor           \T4, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1       # accumulate (a1^a0)*(b1^b0)

        ######################
        vmovdqu         HashKey_3(arg2), \T5
        vpshufd         $0b01001110, \XMM6, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM6, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor           \T4, \T6, \T6           # accumulate a1*b1

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor           \T4, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1       # accumulate (a1^a0)*(b1^b0)

        ######################
        vmovdqu         HashKey_2(arg2), \T5
        vpshufd         $0b01001110, \XMM7, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM7, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor           \T4, \T6, \T6           # accumulate a1*b1

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor           \T4, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1       # accumulate (a1^a0)*(b1^b0)

        ######################
        vmovdqu         HashKey(arg2), \T5
        vpshufd         $0b01001110, \XMM8, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM8, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor           \T4, \T6, \T6           # accumulate a1*b1

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor           \T4, \T7, \T7           # accumulate a0*b0

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2         # middle section of the temp results
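        # XMM1 has accumulated the (a1^a0)*(b1^b0) products, T6 the a1*b1
        # products and T7 the a0*b0 products; XORing T6 and T7 into XMM1
        # recovers the middle 128-bit Karatsuba term, left in T2.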
        vpslldq $8, \T2, \T4                    # shift-L T2 2 DWs (into T4)
        vpsrldq $8, \T2, \T2                    # shift-R T2 2 DWs
        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                                # accumulated carry-less multiplications
        #######################################################################
        #first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor   \T4, \T6, \T6                   # the result is in T6
.endm
#############################################################
#void   aesni_gcm_init_avx_gen4
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#         u8      *iv, /* Pre-counter block j0: 4 byte salt
#                         (from Security Association) concatenated with 8 byte
#                         Initialisation Vector (from IPSec ESP Payload)
#                         concatenated with 0x00000001. 16-byte aligned pointer. */
#         u8      *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         const   u8 *aad, /* Additional Authentication Data (AAD) */
#         u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
SYM_FUNC_START(aesni_gcm_init_avx_gen4)
        FUNC_SAVE
        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_init_avx_gen4)
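###############################################################################
# Call-sequence note (a sketch of the intended use, mirroring the prototypes
# in this file): aesni_gcm_init_avx_gen4() is called once per message to
# consume the pre-counter block and the AAD, aesni_gcm_enc_update_avx_gen4()
# or aesni_gcm_dec_update_avx_gen4() is then called one or more times over
# the payload, and aesni_gcm_finalize_avx_gen4() emits the authentication tag.
###############################################################################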
###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_enc_update4
        cmp     $16, %eax
        je      key_128_enc_update4
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
        FUNC_RESTORE
        RET
key_128_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
        FUNC_RESTORE
        RET
key_256_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
###############################################################################
#void   aesni_gcm_dec_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_dec_update4
        cmp     $16, %eax
        je      key_128_dec_update4
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
        FUNC_RESTORE
        RET
key_128_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
        FUNC_RESTORE
        RET
key_256_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
###############################################################################
#void   aesni_gcm_finalize_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_finalize4
        cmp     $16, %eax
        je      key_128_finalize4
        # must be 192
        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
        FUNC_RESTORE
        RET
key_128_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
        FUNC_RESTORE
        RET
key_256_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)