/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
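/*
 * Rough C-equivalent of the value this routine produces (an illustrative
 * sketch only, using kernel-style u16/u32/u64 types; the assembly below is
 * the actual implementation):
 *
 *	u16 ip_fast_csum_sketch(const void *iph, unsigned int ihl)
 *	{
 *		const u32 *p = iph;
 *		u64 sum = 0;
 *		unsigned int i;
 *
 *		for (i = 0; i < ihl; i++)	// ihl is in 32-bit words, >= 5
 *			sum += p[i];
 *		sum = (sum & 0xffffffff) + (sum >> 32);	// fold 64 -> 32
 *		sum = (sum & 0xffff) + (sum >> 16);	// fold 32 -> 16
 *		sum = (sum & 0xffff) + (sum >> 16);	// add carry back in
 *		return (u16)~sum;
 *	}
 */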
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Compute checksum of TCP or UDP pseudo-header:
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * No real gain trying to do this specially for 64 bit, but
 * the 32 bit addition may spill into the upper bits of
 * the doubleword so we still must fold it down from 64.
 */
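/*
 * Rough C-equivalent (illustrative sketch only; the pseudo-header word is
 * assembled big-endian, matching the rlwimi below which forms
 * (proto << 16) | len in r5):
 *
 *	u16 csum_tcpudp_magic_sketch(u32 saddr, u32 daddr, u32 len,
 *				     u8 proto, u32 sum)
 *	{
 *		u64 s = (u64)saddr + daddr + ((u32)proto << 16)
 *			+ (len & 0xffff) + sum;
 *
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 64 -> 32
 *		s = (s & 0xffff) + (s >> 16);		// fold 32 -> 16
 *		s = (s & 0xffff) + (s >> 16);		// add carry back in
 *		return (u16)~s;
 *	}
 */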
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
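/*
 * Rough C-equivalent of the partial checksum (illustrative sketch only;
 * the returned 32-bit value may differ bit-for-bit from the assembly, but
 * it is equivalent as a 1's complement sum):
 *
 *	u32 csum_partial_sketch(const void *buff, int len, u32 sum)
 *	{
 *		const u8 *p = buff;
 *		u64 s = sum;
 *
 *		while (len > 1) {		// big-endian 16-bit words
 *			s += (p[0] << 8) | p[1];
 *			p += 2;
 *			len -= 2;
 *		}
 *		if (len)			// trailing byte, zero padded
 *			s += p[0] << 8;
 *		while (s >> 32)			// fold carries back in
 *			s = (s & 0xffffffff) + (s >> 32);
 *		return (u32)s;
 *	}
 */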
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
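	/*
	 * The counter below is set to len/64 - 1 because the last 64-byte
	 * block is summed by the unrolled exit limb after the bdnz; the
	 * len >= 128 check just below guarantees the counter is at least 1.
	 */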
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
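	/*
	 * Each iteration below issues eight dependent addes (one per
	 * doubleword of the 64-byte block); at 2 cycles each that is the
	 * 16-cycle floor mentioned above.
	 */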
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

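/*
 * The macros below tag the load or store that follows them with an
 * __ex_table entry, so a fault on that access branches to the matching
 * error handler. "source"/"dest" are only used inside the unrolled loop,
 * where r14-r16 have been saved on the stack, so their handlers restore
 * those registers and pop the frame first; the "srcnr"/"dstnr" variants
 * are used outside that region and go straight to the error return.
 */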
	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
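/*
 * Illustrative caller-side sketch (names and types are an example, not a
 * specific kernel call site):
 *
 *	int src_err = 0, dst_err = 0;
 *	u32 csum;
 *
 *	csum = csum_partial_copy_generic(src, dst, len, sum,
 *					 &src_err, &dst_err);
 *	if (src_err || dst_err) {
 *		// -EFAULT was stored through the failing side's pointer;
 *		// the caller must recover, e.g. zero the destination or
 *		// propagate the fault.
 *	}
 */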
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr