checksum_64.S (8149B)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)
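
/*
 * For reference only (not part of the build): a minimal C sketch of the
 * arithmetic __csum_partial performs, assuming an 8-byte-aligned buffer
 * whose length is a multiple of 8 so the alignment and tail handling above
 * can be ignored.  The function name and local variables are hypothetical;
 * this is not the kernel's generic C implementation.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static uint32_t csum_partial_sketch(const uint8_t *buff,
 *					    size_t len, uint32_t sum)
 *	{
 *		uint64_t acc = sum;		// running 64-bit accumulator
 *		uint64_t tmp;
 *
 *		while (len >= 8) {		// 64-bit adds with end-around
 *			memcpy(&tmp, buff, 8);	// carry, like the adde chain
 *			acc += tmp;
 *			if (acc < tmp)
 *				acc += 1;
 *			buff += 8;
 *			len -= 8;
 *		}
 *
 *		// Fold the two 32-bit halves together, as .Lcsum_finish does.
 *		acc = (acc & 0xffffffffULL) + (acc >> 32);
 *		acc = (acc & 0xffffffffULL) + (acc >> 32);
 *		return (uint32_t)acc;
 *	}
 */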

	.macro srcnr
100:
	EX_TABLE(100b,.Lerror_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lerror)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Lerror_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Lerror)
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
_GLOBAL(csum_partial_copy_generic)
	li	r6,-1
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lerror:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lerror_nr:
	li	r3,0
	blr

EXPORT_SYMBOL(csum_partial_copy_generic)
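
/*
 * The srcnr/source/dstnr/dest macros above attach EX_TABLE fixup entries to
 * the marked loads and stores: if one of them faults, execution is redirected
 * to .Lerror (which restores the saved non-volatile registers and the stack
 * frame) or to .Lerror_nr, and the routine returns 0.
 *
 * For reference only (not part of the build): a minimal C sketch of the
 * copy-and-checksum loop, again assuming an aligned, multiple-of-8 length.
 * Names are hypothetical, and the fault-handling behaviour has no portable
 * C equivalent, so it is omitted here.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static uint32_t csum_and_copy_sketch(const uint8_t *src, uint8_t *dst,
 *					     size_t len)
 *	{
 *		uint64_t acc = ~(uint64_t)0;	// seeded with all ones, as the
 *		uint64_t tmp;			// li r6,-1 / addic above does
 *
 *		while (len >= 8) {
 *			memcpy(&tmp, src, 8);	// load once ...
 *			memcpy(dst, &tmp, 8);	// ... store ...
 *			acc += tmp;		// ... and accumulate with
 *			if (acc < tmp)		// end-around carry
 *				acc += 1;
 *			src += 8;
 *			dst += 8;
 *			len -= 8;
 *		}
 *
 *		acc = (acc & 0xffffffffULL) + (acc >> 32);	// fold, as in
 *		acc = (acc & 0xffffffffULL) + (acc >> 32);	// .Lcopy_finish
 *		return (uint32_t)acc;
 *	}
 */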

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 */

_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)
	ld	r9, 8(r3)
	add	r5, r5, r6
	addc	r0, r8, r9
	ld	r10, 0(r4)
	ld	r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	rotldi	r5, r5, 8
#endif
	adde	r0, r0, r10
	add	r5, r5, r7
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0
	rotldi	r3, r0, 32	/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32
	rotlwi	r3, r0, 16	/* fold two 16 bit halves together */
	add	r3, r0, r3
	not	r3, r3
	rlwinm	r3, r3, 16, 16, 31
	blr
EXPORT_SYMBOL(csum_ipv6_magic)
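
/*
 * For reference only (not part of the build): a portable C sketch of the
 * IPv6 pseudo-header checksum that csum_ipv6_magic computes.  The function
 * name is hypothetical; the incoming partial sum argument and the
 * little-endian byte rotation (rotldi r5,r5,8) are left out, and the
 * addresses are summed as big-endian 16-bit words rather than as 64-bit
 * doublewords, so the in-register byte order differs from the assembly.
 *
 *	#include <stdint.h>
 *
 *	static uint16_t csum_ipv6_magic_sketch(const uint8_t saddr[16],
 *					       const uint8_t daddr[16],
 *					       uint32_t len, uint8_t proto)
 *	{
 *		uint64_t acc = 0;
 *		int i;
 *
 *		for (i = 0; i < 16; i += 2) {	// both addresses, 16-bit words
 *			acc += ((uint32_t)saddr[i] << 8) | saddr[i + 1];
 *			acc += ((uint32_t)daddr[i] << 8) | daddr[i + 1];
 *		}
 *		acc += len;			// upper-layer packet length
 *		acc += proto;			// next-header (protocol) value
 *
 *		while (acc >> 16)		// fold to 16 bits with
 *			acc = (acc & 0xffff) + (acc >> 16);	// end-around carry
 *
 *		return (uint16_t)~acc;		// ones-complement of the sum
 *	}
 */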