crypto: x86/cast5 - Use RIP-relative addressing
author: Ard Biesheuvel <ardb@kernel.org>
Wed, 12 Apr 2023 11:00:27 +0000 (13:00 +0200)
committer: Herbert Xu <herbert@gondor.apana.org.au>
Thu, 20 Apr 2023 10:20:04 +0000 (18:20 +0800)
Prefer RIP-relative addressing where possible, which removes the need
for boot time relocation fixups.

Co-developed-by: Thomas Garnier <thgarnie@chromium.org>
Signed-off-by: Thomas Garnier <thgarnie@chromium.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/x86/crypto/cast5-avx-x86_64-asm_64.S

index 0326a01503c3a554bf89482ed158eb7c8719c9ce..b4e460a87f18ddaac4ebb9cee25c86875cb87e7c 100644 (file)
 
 #define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
        movzbl          src ## bh,     RID1d;    \
+       leaq            s1(%rip),      RID2;     \
+       movl            (RID2,RID1,4), dst ## d; \
        movzbl          src ## bl,     RID2d;    \
+       leaq            s2(%rip),      RID1;     \
+       op1             (RID1,RID2,4), dst ## d; \
        shrq $16,       src;                     \
-       movl            s1(, RID1, 4), dst ## d; \
-       op1             s2(, RID2, 4), dst ## d; \
        movzbl          src ## bh,     RID1d;    \
+       leaq            s3(%rip),      RID2;     \
+       op2             (RID2,RID1,4), dst ## d; \
        movzbl          src ## bl,     RID2d;    \
        interleave_op(il_reg);                   \
-       op2             s3(, RID1, 4), dst ## d; \
-       op3             s4(, RID2, 4), dst ## d;
+       leaq            s4(%rip),      RID1;     \
+       op3             (RID1,RID2,4), dst ## d;
 
 #define dummy(d) /* do nothing */
 
        subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
 
 #define enc_preload_rkr() \
-       vbroadcastss    .L16_mask,                RKR;      \
+       vbroadcastss    .L16_mask(%rip),          RKR;      \
        /* add 16-bit rotation to key rotations (mod 32) */ \
        vpxor           kr(CTX),                  RKR, RKR;
 
 #define dec_preload_rkr() \
-       vbroadcastss    .L16_mask,                RKR;      \
+       vbroadcastss    .L16_mask(%rip),          RKR;      \
        /* add 16-bit rotation to key rotations (mod 32) */ \
        vpxor           kr(CTX),                  RKR, RKR; \
-       vpshufb         .Lbswap128_mask,          RKR, RKR;
+       vpshufb         .Lbswap128_mask(%rip),    RKR, RKR;
 
 #define transpose_2x4(x0, x1, t0, t1) \
        vpunpckldq              x1, x0, t0; \
@@ -235,9 +239,9 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
 
        movq %rdi, CTX;
 
-       vmovdqa .Lbswap_mask, RKM;
-       vmovd .Lfirst_mask, R1ST;
-       vmovd .L32_mask, R32;
+       vmovdqa .Lbswap_mask(%rip), RKM;
+       vmovd .Lfirst_mask(%rip), R1ST;
+       vmovd .L32_mask(%rip), R32;
        enc_preload_rkr();
 
        inpack_blocks(RL1, RR1, RTMP, RX, RKM);
@@ -271,7 +275,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
        popq %rbx;
        popq %r15;
 
-       vmovdqa .Lbswap_mask, RKM;
+       vmovdqa .Lbswap_mask(%rip), RKM;
 
        outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
        outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
@@ -308,9 +312,9 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
 
        movq %rdi, CTX;
 
-       vmovdqa .Lbswap_mask, RKM;
-       vmovd .Lfirst_mask, R1ST;
-       vmovd .L32_mask, R32;
+       vmovdqa .Lbswap_mask(%rip), RKM;
+       vmovd .Lfirst_mask(%rip), R1ST;
+       vmovd .L32_mask(%rip), R32;
        dec_preload_rkr();
 
        inpack_blocks(RL1, RR1, RTMP, RX, RKM);
@@ -341,7 +345,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
        round(RL, RR, 1, 2);
        round(RR, RL, 0, 1);
 
-       vmovdqa .Lbswap_mask, RKM;
+       vmovdqa .Lbswap_mask(%rip), RKM;
        popq %rbx;
        popq %r15;
 
@@ -504,8 +508,8 @@ SYM_FUNC_START(cast5_ctr_16way)
 
        vpcmpeqd RKR, RKR, RKR;
        vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
-       vmovdqa .Lbswap_iv_mask, R1ST;
-       vmovdqa .Lbswap128_mask, RKM;
+       vmovdqa .Lbswap_iv_mask(%rip), R1ST;
+       vmovdqa .Lbswap128_mask(%rip), RKM;
 
        /* load IV and byteswap */
        vmovq (%rcx), RX;