/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
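	/*
	 * A worked example of the sizing described above: for len = 200,
	 * the srdi. by 7 below is non-zero (200 >= 128), the count
	 * register is loaded with 200/64 - 1 = 2 unrolled iterations
	 * (128 bytes), the exit limb consumes a further 64 bytes, and
	 * the remaining 200 & 63 = 8 bytes fall through to the
	 * doubleword/word/halfword/byte tails.
	 */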
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)
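
/*
 * For readers coming from C, the block below is a rough model of what
 * __csum_partial computes: a running 64-bit sum of native-order words
 * with end-around carry, folded down to 32 bits the same way the
 * rldicl/add/srdi epilogue above does.  It is an illustrative sketch
 * only (the function name is made up, and alignment handling, the loop
 * unrolling and the sub-word tails are simplified away); it is never
 * built.
 */
#if 0
#include <stdint.h>
#include <stddef.h>
#include <string.h>

static uint32_t csum_partial_model(const void *buff, size_t len, uint32_t sum)
{
	const unsigned char *p = buff;
	uint64_t acc = sum;			/* addic r0,r5,0 */

	/* len is treated as a multiple of 8 here; the real routine also
	 * handles word/halfword/byte tails and an unaligned prologue. */
	while (len >= 8) {
		uint64_t w;

		memcpy(&w, p, 8);		/* native byte order, like ld */
		acc += w;
		if (acc < w)			/* emulate adde's carry wrap */
			acc++;
		p += 8;
		len -= 8;
	}

	acc = (acc >> 32) + (acc & 0xffffffffULL);	/* fold 64 -> 32 */
	acc = (acc >> 32) + (acc & 0xffffffffULL);	/* add the carry back in */
	return (uint32_t)acc;
}
#endif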
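
/*
 * Each of the macros below attaches an exception-table entry to the
 * load or store that follows it: if that access faults, the exception
 * handler transfers control to the given fixup label instead of
 * treating it as a fatal kernel fault.  The "nr" variants are used
 * where the non-volatile registers have not been saved (or the stack
 * frame has already been popped), so their fixup labels skip the
 * register-restore sequence done by .Lsrc_error/.Ldest_error.
 */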
	.macro srcnr
100:
	EX_TABLE(100b,.Lsrc_error_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lsrc_error)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Ldest_error_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Ldest_error)
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
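/*
 * Roughly the C-level prototype corresponding to the register
 * assignments above (the authoritative declaration lives in
 * arch/powerpc/include/asm/checksum.h):
 *
 *	__wsum csum_partial_copy_generic(const void *src, void *dst,
 *					 int len, __wsum sum,
 *					 int *src_err, int *dst_err);
 */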
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
EXPORT_SYMBOL(csum_partial_copy_generic)
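
/*
 * Both routines return a 32-bit partial sum, not the final Internet
 * checksum.  Callers reduce it to the 16-bit ones' complement result
 * with csum_fold() (see arch/powerpc/include/asm/checksum.h); a rough
 * model of that helper, for illustration only and never built, is:
 */
#if 0
#include <stdint.h>

static inline uint16_t csum_fold_model(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* fold 32 -> 16 */
	sum = (sum & 0xffff) + (sum >> 16);	/* add the carry back in */
	return (uint16_t)~sum;			/* ones' complement */
}
#endif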