Commit | Line | Data |
---|---|---|
2874c5fd | 1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
1c201e64 MS |
2 | /* |
3 | * Fast AES implementation for SPE instruction set (PPC) | |
4 | * | |
5 | * This code makes use of the SPE SIMD instruction set as defined in | |
6 | * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf | |
7 | * Implementation is based on optimization guide notes from | |
8 | * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf | |
9 | * | |
10 | * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> | |
1c201e64 MS |
11 | */ |
12 | ||
13 | #include <asm/ppc_asm.h> | |
14 | #include "aes-spe-regs.h" | |
15 | ||
16 | #define EAD(in, bpos) \ | |
17 | rlwimi rT0,in,28-((bpos+3)%4)*8,20,27; /* enc index: rotate byte bpos of in (0 = least significant, 3 = most significant) into bits 20-27 of rT0, i.e. byte * 16, all other rT0 bits are kept */ | |
18 | ||
19 | #define DAD(in, bpos) \ | |
20 | rlwimi rT1,in,24-((bpos+3)%4)*8,24,31; /* dec index: rotate byte bpos of in (same numbering as EAD) into bits 24-31 of rT1, i.e. the unscaled byte, the upper 24 bits of rT1 are kept */ | |
21 | ||
22 | #define LWH(out, off) \ | |
23 | evlwwsplat out,off(rT0); /* load word high: the word at off(rT0) is splatted into both 32-bit halves of out, so the high half is set */ | |
24 | ||
25 | #define LWL(out, off) \ | |
26 | lwz out,off(rT0); /* load word low: plain 32-bit load from off(rT0), used after LWH to the same register to replace the low half */ | |
27 | ||
28 | #define LBZ(out, tab, off) \ | |
29 | lbz out,off(tab); /* load byte at off(tab), zero extended */ | |
30 | ||
31 | #define LAH(out, in, bpos, off) \ | |
32 | EAD(in, bpos) /* calc addr + load word high: EAD must come first because LWH reads through rT0 */ \ | |
33 | LWH(out, off) | |
34 | ||
35 | #define LAL(out, in, bpos, off) \ | |
36 | EAD(in, bpos) /* calc addr + load word low: EAD must come first because LWL reads through rT0 */ \ | |
37 | LWL(out, off) | |
38 | ||
39 | #define LAE(out, in, bpos) \ | |
40 | EAD(in, bpos) /* calc addr + load enc byte: the byte is taken from offset 8 of the 16-byte slot selected by EAD */ \ | |
41 | LBZ(out, rT0, 8) | |
42 | ||
43 | #define LBE(out) \ | |
44 | LBZ(out, rT0, 8) /* load enc byte from the slot rT0 already addresses, the EAD is issued separately by the user of this macro */ | |
45 | ||
46 | #define LAD(out, in, bpos) \ | |
47 | DAD(in, bpos) /* calc addr + load dec byte: the byte is taken from offset 0 of the address DAD builds in rT1 */ \ | |
48 | LBZ(out, rT1, 0) | |
49 | ||
50 | #define LBD(out) \ | |
51 | LBZ(out, rT1, 0) /* load dec byte from the address already in rT1, the DAD is issued separately by the user of this macro */ | |
52 | ||
53 | /* | |
54 | * ppc_encrypt_block: The central encryption function for a single 16 bytes | |
55 | * block. It does no stack handling or register saving to support fast calls | |
56 | * via bl/blr. It expects that caller has pre-xored input data with first | |
57 | * 4 words of encryption key into rD0-rD3. Pointer/counter registers must | |
58 | * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3 | |
446957ba | 59 | * and rW0-rW3 and caller must execute a final xor on the output registers. |
1c201e64 MS |
60 | * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing. |
61 | * Structure as coded below: every loop iteration runs two unrolled table passes and advances rKP by 32 bytes, CTR (bdnz) counts the iterations, then one more table pass and a last pass built from single byte loads follow. On return rW0-rW3 hold the assembled bytes and rD0-rD3 the four words read from 32(rKP) to 44(rKP). | |
62 | */ | |
63 | _GLOBAL(ppc_encrypt_block) | |
64 | LAH(rW4, rD1, 2, 4) /* prime rW4, rW6 and rW3 for the first pass, each pass ends by issuing the same three lookups for the pass that follows */ | |
65 | LAH(rW6, rD0, 3, 0) | |
66 | LAH(rW3, rD0, 1, 8) | |
67 | ppc_encrypt_block_loop: /* two unrolled table passes per iteration */ | |
68 | LAH(rW0, rD3, 0, 12) /* pass 1: gather table words, LAH sets the high word and a later LAL or LWL to the same register sets the low word */ | |
69 | LAL(rW0, rD0, 0, 12) | |
70 | LAH(rW1, rD1, 0, 12) | |
71 | LAH(rW2, rD2, 1, 8) | |
72 | LAL(rW2, rD3, 1, 8) | |
73 | LAL(rW3, rD1, 1, 8) | |
74 | LAL(rW4, rD2, 2, 4) | |
75 | LAL(rW6, rD1, 3, 0) | |
76 | LAH(rW5, rD3, 2, 4) | |
77 | LAL(rW5, rD0, 2, 4) | |
78 | LAH(rW7, rD2, 3, 0) | |
79 | evldw rD1,16(rKP) /* 8 key bytes from rKP+16, overwrites rD1 now that its lookups above are issued */ | |
80 | EAD(rD3, 3) | |
81 | evxor rW2,rW2,rW4 /* fold the gathered table words together */ | |
82 | LWL(rW7, 0) | |
83 | evxor rW2,rW2,rW6 | |
84 | EAD(rD2, 0) | |
85 | evxor rD1,rD1,rW2 | |
86 | LWL(rW1, 12) | |
87 | evxor rD1,rD1,rW0 | |
88 | evldw rD3,24(rKP) /* 8 key bytes from rKP+24 */ | |
89 | evmergehi rD0,rD0,rD1 /* low word of rD0 := high word of rD1, so rlwimi based lookups can read it */ | |
90 | EAD(rD1, 2) /* lookups for the next pass are interleaved with the remaining xors */ | |
91 | evxor rW3,rW3,rW5 | |
92 | LWH(rW4, 4) | |
93 | evxor rW3,rW3,rW7 | |
94 | EAD(rD0, 3) | |
95 | evxor rD3,rD3,rW3 | |
96 | LWH(rW6, 0) | |
97 | evxor rD3,rD3,rW1 | |
98 | EAD(rD0, 1) | |
99 | evmergehi rD2,rD2,rD3 /* low word of rD2 := high word of rD3 */ | |
100 | LWH(rW3, 8) | |
101 | LAH(rW0, rD3, 0, 12) /* pass 2: same sequence as pass 1, key bytes come from rKP+32 and rKP+40 */ | |
102 | LAL(rW0, rD0, 0, 12) | |
103 | LAH(rW1, rD1, 0, 12) | |
104 | LAH(rW2, rD2, 1, 8) | |
105 | LAL(rW2, rD3, 1, 8) | |
106 | LAL(rW3, rD1, 1, 8) | |
107 | LAL(rW4, rD2, 2, 4) | |
108 | LAL(rW6, rD1, 3, 0) | |
109 | LAH(rW5, rD3, 2, 4) | |
110 | LAL(rW5, rD0, 2, 4) | |
111 | LAH(rW7, rD2, 3, 0) | |
112 | evldw rD1,32(rKP) | |
113 | EAD(rD3, 3) | |
114 | evxor rW2,rW2,rW4 | |
115 | LWL(rW7, 0) | |
116 | evxor rW2,rW2,rW6 | |
117 | EAD(rD2, 0) | |
118 | evxor rD1,rD1,rW2 | |
119 | LWL(rW1, 12) | |
120 | evxor rD1,rD1,rW0 | |
121 | evldw rD3,40(rKP) | |
122 | evmergehi rD0,rD0,rD1 | |
123 | EAD(rD1, 2) | |
124 | evxor rW3,rW3,rW5 | |
125 | LWH(rW4, 4) | |
126 | evxor rW3,rW3,rW7 | |
127 | EAD(rD0, 3) | |
128 | evxor rD3,rD3,rW3 | |
129 | LWH(rW6, 0) | |
130 | evxor rD3,rD3,rW1 | |
131 | EAD(rD0, 1) | |
132 | evmergehi rD2,rD2,rD3 | |
133 | LWH(rW3, 8) | |
134 | addi rKP,rKP,32 /* step the key pointer over the 32 key bytes consumed by the two passes */ | |
135 | bdnz ppc_encrypt_block_loop /* decrement CTR and repeat while it is not zero */ | |
136 | LAH(rW0, rD3, 0, 12) /* after the loop: one more table pass, key bytes from rKP+16 and rKP+24 */ | |
137 | LAL(rW0, rD0, 0, 12) | |
138 | LAH(rW1, rD1, 0, 12) | |
139 | LAH(rW2, rD2, 1, 8) | |
140 | LAL(rW2, rD3, 1, 8) | |
141 | LAL(rW3, rD1, 1, 8) | |
142 | LAL(rW4, rD2, 2, 4) | |
143 | LAH(rW5, rD3, 2, 4) | |
144 | LAL(rW6, rD1, 3, 0) | |
145 | LAL(rW5, rD0, 2, 4) | |
146 | LAH(rW7, rD2, 3, 0) | |
147 | evldw rD1,16(rKP) | |
148 | EAD(rD3, 3) | |
149 | evxor rW2,rW2,rW4 | |
150 | LWL(rW7, 0) | |
151 | evxor rW2,rW2,rW6 | |
152 | EAD(rD2, 0) | |
153 | evxor rD1,rD1,rW2 | |
154 | LWL(rW1, 12) | |
155 | evxor rD1,rD1,rW0 | |
156 | evldw rD3,24(rKP) | |
157 | evmergehi rD0,rD0,rD1 | |
158 | EAD(rD1, 0) /* from here on single bytes are fetched (LBE and LAE, offset 8 of the selected slot) instead of words */ | |
159 | evxor rW3,rW3,rW5 | |
160 | LBE(rW2) | |
161 | evxor rW3,rW3,rW7 | |
162 | EAD(rD0, 1) | |
163 | evxor rD3,rD3,rW3 | |
164 | LBE(rW6) | |
165 | evxor rD3,rD3,rW1 | |
166 | EAD(rD0, 0) | |
167 | evmergehi rD2,rD2,rD3 | |
168 | LBE(rW1) | |
169 | LAE(rW0, rD3, 0) | |
170 | LAE(rW1, rD0, 0) | |
171 | LAE(rW4, rD2, 1) | |
172 | LAE(rW5, rD3, 1) | |
173 | LAE(rW3, rD2, 0) | |
174 | LAE(rW7, rD1, 1) | |
175 | rlwimi rW0,rW4,8,16,23 /* assemble output words: byte from rW4 goes into bits 16-23 of rW0 */ | |
176 | rlwimi rW1,rW5,8,16,23 | |
177 | LAE(rW4, rD1, 2) | |
178 | LAE(rW5, rD2, 2) | |
179 | rlwimi rW2,rW6,8,16,23 | |
180 | rlwimi rW3,rW7,8,16,23 | |
181 | LAE(rW6, rD3, 2) | |
182 | LAE(rW7, rD0, 2) | |
183 | rlwimi rW0,rW4,16,8,15 /* next byte goes into bits 8-15 */ | |
184 | rlwimi rW1,rW5,16,8,15 | |
185 | LAE(rW4, rD0, 3) | |
186 | LAE(rW5, rD1, 3) | |
187 | rlwimi rW2,rW6,16,8,15 | |
188 | lwz rD0,32(rKP) /* rD0-rD3 are reloaded with the words at 32(rKP) to 44(rKP), each one only after its last use as a lookup source */ | |
189 | rlwimi rW3,rW7,16,8,15 | |
190 | lwz rD1,36(rKP) | |
191 | LAE(rW6, rD2, 3) | |
192 | LAE(rW7, rD3, 3) | |
193 | rlwimi rW0,rW4,24,0,7 /* last byte goes into bits 0-7 and completes the word */ | |
194 | lwz rD2,40(rKP) | |
195 | rlwimi rW1,rW5,24,0,7 | |
196 | lwz rD3,44(rKP) | |
197 | rlwimi rW2,rW6,24,0,7 | |
198 | rlwimi rW3,rW7,24,0,7 | |
199 | blr /* plain return, nothing was saved so nothing is restored */ | |
200 | ||
201 | /* | |
202 | * ppc_decrypt_block: The central decryption function for a single 16 bytes | |
203 | * block. It does no stack handling or register saving to support fast calls | |
204 | * via bl/blr. It expects that caller has pre-xored input data with first | |
205 | * 4 words of encryption key into rD0-rD3. Pointer/counter registers must | |
206 | * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3 | |
446957ba | 207 | * and rW0-rW3 and caller must execute a final xor on the output registers. |
1c201e64 MS |
208 | * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing. |
209 | * Note: the last pass below also indexes through rT1 (DAD, LAD, LBD), which only replace its low 8 bits, so rT1 has to be set up by the caller as well although it is not listed above. NOTE(review): the wording "encryption key" mirrors the encrypt header, presumably the key schedule at rKP used for decryption is meant - confirm against the caller. | |
210 | */ | |
211 | _GLOBAL(ppc_decrypt_block) | |
212 | LAH(rW0, rD1, 0, 12) /* prime rW0, rW6 and rW3 for the first pass, each pass ends by issuing the same three lookups for the pass that follows */ | |
213 | LAH(rW6, rD0, 3, 0) | |
214 | LAH(rW3, rD0, 1, 8) | |
215 | ppc_decrypt_block_loop: /* two unrolled table passes per iteration */ | |
216 | LAH(rW1, rD3, 0, 12) /* pass 1: gather table words, LAH sets the high word and a later LAL or LWL to the same register sets the low word */ | |
217 | LAL(rW0, rD2, 0, 12) | |
218 | LAH(rW2, rD2, 1, 8) | |
219 | LAL(rW2, rD3, 1, 8) | |
220 | LAH(rW4, rD3, 2, 4) | |
221 | LAL(rW4, rD0, 2, 4) | |
222 | LAL(rW6, rD1, 3, 0) | |
223 | LAH(rW5, rD1, 2, 4) | |
224 | LAH(rW7, rD2, 3, 0) | |
225 | LAL(rW7, rD3, 3, 0) | |
226 | LAL(rW3, rD1, 1, 8) | |
227 | evldw rD1,16(rKP) /* 8 key bytes from rKP+16, overwrites rD1 now that its lookups above are issued */ | |
228 | EAD(rD0, 0) | |
229 | evxor rW4,rW4,rW6 /* fold the gathered table words together */ | |
230 | LWL(rW1, 12) | |
231 | evxor rW0,rW0,rW4 | |
232 | EAD(rD2, 2) | |
233 | evxor rW0,rW0,rW2 | |
234 | LWL(rW5, 4) | |
235 | evxor rD1,rD1,rW0 | |
236 | evldw rD3,24(rKP) /* 8 key bytes from rKP+24 */ | |
237 | evmergehi rD0,rD0,rD1 /* low word of rD0 := high word of rD1, so rlwimi based lookups can read it */ | |
238 | EAD(rD1, 0) /* lookups for the next pass are interleaved with the remaining xors */ | |
239 | evxor rW3,rW3,rW7 | |
240 | LWH(rW0, 12) | |
241 | evxor rW3,rW3,rW1 | |
242 | EAD(rD0, 3) | |
243 | evxor rD3,rD3,rW3 | |
244 | LWH(rW6, 0) | |
245 | evxor rD3,rD3,rW5 | |
246 | EAD(rD0, 1) | |
247 | evmergehi rD2,rD2,rD3 /* low word of rD2 := high word of rD3 */ | |
248 | LWH(rW3, 8) | |
249 | LAH(rW1, rD3, 0, 12) /* pass 2: same sequence as pass 1, key bytes come from rKP+32 and rKP+40 */ | |
250 | LAL(rW0, rD2, 0, 12) | |
251 | LAH(rW2, rD2, 1, 8) | |
252 | LAL(rW2, rD3, 1, 8) | |
253 | LAH(rW4, rD3, 2, 4) | |
254 | LAL(rW4, rD0, 2, 4) | |
255 | LAL(rW6, rD1, 3, 0) | |
256 | LAH(rW5, rD1, 2, 4) | |
257 | LAH(rW7, rD2, 3, 0) | |
258 | LAL(rW7, rD3, 3, 0) | |
259 | LAL(rW3, rD1, 1, 8) | |
260 | evldw rD1,32(rKP) | |
261 | EAD(rD0, 0) | |
262 | evxor rW4,rW4,rW6 | |
263 | LWL(rW1, 12) | |
264 | evxor rW0,rW0,rW4 | |
265 | EAD(rD2, 2) | |
266 | evxor rW0,rW0,rW2 | |
267 | LWL(rW5, 4) | |
268 | evxor rD1,rD1,rW0 | |
269 | evldw rD3,40(rKP) | |
270 | evmergehi rD0,rD0,rD1 | |
271 | EAD(rD1, 0) | |
272 | evxor rW3,rW3,rW7 | |
273 | LWH(rW0, 12) | |
274 | evxor rW3,rW3,rW1 | |
275 | EAD(rD0, 3) | |
276 | evxor rD3,rD3,rW3 | |
277 | LWH(rW6, 0) | |
278 | evxor rD3,rD3,rW5 | |
279 | EAD(rD0, 1) | |
280 | evmergehi rD2,rD2,rD3 | |
281 | LWH(rW3, 8) | |
282 | addi rKP,rKP,32 /* step the key pointer over the 32 key bytes consumed by the two passes */ | |
283 | bdnz ppc_decrypt_block_loop /* decrement CTR and repeat while it is not zero */ | |
284 | LAH(rW1, rD3, 0, 12) /* after the loop: one more table pass, key bytes from rKP+16 and rKP+24 */ | |
285 | LAL(rW0, rD2, 0, 12) | |
286 | LAH(rW2, rD2, 1, 8) | |
287 | LAL(rW2, rD3, 1, 8) | |
288 | LAH(rW4, rD3, 2, 4) | |
289 | LAL(rW4, rD0, 2, 4) | |
290 | LAL(rW6, rD1, 3, 0) | |
291 | LAH(rW5, rD1, 2, 4) | |
292 | LAH(rW7, rD2, 3, 0) | |
293 | LAL(rW7, rD3, 3, 0) | |
294 | LAL(rW3, rD1, 1, 8) | |
295 | evldw rD1,16(rKP) | |
296 | EAD(rD0, 0) | |
297 | evxor rW4,rW4,rW6 | |
298 | LWL(rW1, 12) | |
299 | evxor rW0,rW0,rW4 | |
300 | EAD(rD2, 2) | |
301 | evxor rW0,rW0,rW2 | |
302 | LWL(rW5, 4) | |
303 | evxor rD1,rD1,rW0 | |
304 | evldw rD3,24(rKP) | |
305 | evmergehi rD0,rD0,rD1 | |
306 | DAD(rD1, 0) /* from here on single bytes are fetched through rT1 (DAD, LBD, LAD, offset 0) instead of words through rT0 */ | |
307 | evxor rW3,rW3,rW7 | |
308 | LBD(rW0) | |
309 | evxor rW3,rW3,rW1 | |
310 | DAD(rD0, 1) | |
311 | evxor rD3,rD3,rW3 | |
312 | LBD(rW6) | |
313 | evxor rD3,rD3,rW5 | |
314 | DAD(rD0, 0) | |
315 | evmergehi rD2,rD2,rD3 | |
316 | LBD(rW3) | |
317 | LAD(rW2, rD3, 0) | |
318 | LAD(rW1, rD2, 0) | |
319 | LAD(rW4, rD2, 1) | |
320 | LAD(rW5, rD3, 1) | |
321 | LAD(rW7, rD1, 1) | |
322 | rlwimi rW0,rW4,8,16,23 /* assemble output words: byte from rW4 goes into bits 16-23 of rW0 */ | |
323 | rlwimi rW1,rW5,8,16,23 | |
324 | LAD(rW4, rD3, 2) | |
325 | LAD(rW5, rD0, 2) | |
326 | rlwimi rW2,rW6,8,16,23 | |
327 | rlwimi rW3,rW7,8,16,23 | |
328 | LAD(rW6, rD1, 2) | |
329 | LAD(rW7, rD2, 2) | |
330 | rlwimi rW0,rW4,16,8,15 /* next byte goes into bits 8-15 */ | |
331 | rlwimi rW1,rW5,16,8,15 | |
332 | LAD(rW4, rD0, 3) | |
333 | LAD(rW5, rD1, 3) | |
334 | rlwimi rW2,rW6,16,8,15 | |
335 | lwz rD0,32(rKP) /* rD0-rD3 are reloaded with the words at 32(rKP) to 44(rKP), each one only after its last use as a lookup source */ | |
336 | rlwimi rW3,rW7,16,8,15 | |
337 | lwz rD1,36(rKP) | |
338 | LAD(rW6, rD2, 3) | |
339 | LAD(rW7, rD3, 3) | |
340 | rlwimi rW0,rW4,24,0,7 /* last byte goes into bits 0-7 and completes the word */ | |
341 | lwz rD2,40(rKP) | |
342 | rlwimi rW1,rW5,24,0,7 | |
343 | lwz rD3,44(rKP) | |
344 | rlwimi rW2,rW6,24,0,7 | |
345 | rlwimi rW3,rW7,24,0,7 | |
346 | blr /* plain return, nothing was saved so nothing is restored */ | |