memcpy_power7.S (10214B)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

/*
 * Endian-dependent helpers used by the unaligned VMX path.
 *
 * LVS builds the permute control vector from an address and VPERM merges
 * two neighbouring quadwords with it.  Little endian uses lvsr and swaps
 * the two data operands handed to vperm.
 */
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

/*
 * memcpy_power7
 *
 * Register roles, as used by the code below:
 *   r3	store (destination) pointer.  The entry value is parked below
 *	the stack pointer in the STK_REG(R31) slot and reloaded into r3
 *	on every exit path.
 *   r4	load (source) pointer.
 *   r5	number of bytes still to be copied.
 *
 * Dispatch on the byte count:
 *   r5 <  16	-> .Lshort_copy
 *   r5 >  4096	-> .Lvmx_copy (CONFIG_ALTIVEC only; the branch sits in a
 *		   CPU_FTR_ALTIVEC feature section)
 *   otherwise	-> .Lnonvmx_copy
 */
_GLOBAL(memcpy_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,4096
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/*
	 * Bring the source pointer up to 8B alignment.  cr7 receives the
	 * low bits of -r4, so each bit selects one 1/2/4 byte move; r6
	 * keeps the total number of bytes moved here.
	 */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,.Lnonvmx_align_half
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

.Lnonvmx_align_half:
	bf	cr7*4+2,.Lnonvmx_align_word
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

.Lnonvmx_align_word:
	bf	cr7*4+1,.Lnonvmx_aligned
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

.Lnonvmx_aligned:
	sub	r5,r5,r6
	cmpldi	r5,128
	blt	.Lnonvmx_tail127

	/*
	 * At least one full 128B block remains: open a stack frame and
	 * save the non-volatile GPRs before the block loop runs.
	 */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	/* CTR = number of whole 128B blocks */
	srdi	r6,r5,7
	mtctr	r6

	/* One cacheline (128B) per iteration: sixteen loads, then sixteen stores */
	.align	5
.Lnonvmx_loop128:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	.Lnonvmx_loop128

	/* Keep only the remainder below 128B */
	clrldi	r5,r5,(64-7)

	/* Restore the saved GPRs and drop the stack frame */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/*
	 * At most 127B left.  cr7 gets (r5 >> 4), so its bits select the
	 * 64B, 32B and 16B pieces in turn.
	 */
.Lnonvmx_tail127:
	srdi	r6,r5,4
	mtocrf	0x01,r6

.Lnonvmx_tail64:
	bf	cr7*4+1,.Lnonvmx_tail32
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* At most 63B left */
.Lnonvmx_tail32:
	bf	cr7*4+2,.Lnonvmx_tail16
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* At most 31B left */
.Lnonvmx_tail16:
	bf	cr7*4+3,.Lnonvmx_tail15
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

.Lnonvmx_tail15:
	clrldi	r5,r5,(64-4)

	/*
	 * At most 15B left (also the direct entry for copies shorter than
	 * 16B).  The low four bits of r5 select 8/4/2/1 byte moves.
	 */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,.Lshort_word
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

.Lshort_word:
	bf	cr7*4+1,.Lshort_half
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

.Lshort_half:
	bf	cr7*4+2,.Lshort_byte
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

.Lshort_byte:
	bf	cr7*4+3,.Lshort_done
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/* Hand back the r3 value that was saved on entry */
.Lshort_done:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

	/*
	 * Reached from .Lvmx_copy when enter_vmx_ops returned 0: pop the
	 * frame opened there and use the integer copy instead.
	 */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	/*
	 * Park r4/r5 next to the already saved r3, save LR, open a frame
	 * and call enter_vmx_ops.  Its result is latched in cr1 (tested
	 * after the prefetch setup), then r3-r5 and LR are reloaded.
	 */
	mflr	r0
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_ops
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * Prefetch both the source and the destination with the enhanced
	 * touch instructions: stream ID 0 is the load side, stream ID 1
	 * the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	.Lvmx_stream_setup
	li	r7,0x3FF
.Lvmx_stream_setup:
	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	dcbt	0,r6,0b01000
	dcbt	0,r7,0b01010
	dcbtst	0,r9,0b01000
	dcbtst	0,r10,0b01010
	eieio
	dcbt	0,r8,0b01010	/* GO */

	/* enter_vmx_ops returned 0: fall back to the non-VMX copy */
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * Source and destination must share the same offset within 16B
	 * for plain lvx/stvx; otherwise take the slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Bring the destination up to 16B alignment with 1/2/4/8 byte moves */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,.Lvmx_a_align_half
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

.Lvmx_a_align_half:
	bf	cr7*4+2,.Lvmx_a_align_word
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

.Lvmx_a_align_word:
	bf	cr7*4+1,.Lvmx_a_align_dword
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

.Lvmx_a_align_dword:
	bf	cr7*4+0,.Lvmx_a_aligned16
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

.Lvmx_a_aligned16:
	sub	r5,r5,r6

	/*
	 * Bring the destination up to 128B alignment.  cr7 gets
	 * (-r3 >> 4), selecting 16B/32B/64B vector moves; r6 keeps the
	 * byte total.
	 */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	/* Index registers for the lvx/stvx offsets */
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,.Lvmx_a_pre32
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

.Lvmx_a_pre32:
	bf	cr7*4+2,.Lvmx_a_pre64
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

.Lvmx_a_pre64:
	bf	cr7*4+1,.Lvmx_a_aligned128
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

.Lvmx_a_aligned128:
	sub	r5,r5,r6
	srdi	r6,r5,7

	/* r14-r16 are needed as extra index registers; save them first */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	/* CTR = number of whole 128B blocks */
	mtctr	r6

	/*
	 * Cacheline sized loads and stores.  By this stage the cacheline
	 * stores are also cacheline aligned.
	 */
	.align	5
.Lvmx_a_loop128:
	lvx	v7,0,r4
	lvx	v6,r4,r9
	lvx	v5,r4,r10
	lvx	v4,r4,r11
	lvx	v3,r4,r12
	lvx	v2,r4,r14
	lvx	v1,r4,r15
	lvx	v0,r4,r16
	addi	r4,r4,128
	stvx	v7,0,r3
	stvx	v6,r3,r9
	stvx	v5,r3,r10
	stvx	v4,r3,r11
	stvx	v3,r3,r12
	stvx	v2,r3,r14
	stvx	v1,r3,r15
	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	.Lvmx_a_loop128

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* At most 127B left: 64B, 32B, then 16B vector pieces */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,.Lvmx_a_tail32
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

.Lvmx_a_tail32:
	bf	cr7*4+2,.Lvmx_a_tail16
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

.Lvmx_a_tail16:
	bf	cr7*4+3,.Lvmx_a_tail15
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

	/* At most 15B left: finish with 8/4/2/1 byte integer moves */
.Lvmx_a_tail15:
	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,.Lvmx_a_tail_word
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

.Lvmx_a_tail_word:
	bf	cr7*4+1,.Lvmx_a_tail_half
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

.Lvmx_a_tail_half:
	bf	cr7*4+2,.Lvmx_a_tail_byte
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

.Lvmx_a_tail_byte:
	bf	cr7*4+3,.Lvmx_a_done
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/* Pop the frame, reload the saved r3 and leave through exit_vmx_ops */
.Lvmx_a_done:
	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_ops		/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Bring the destination up to 16B alignment with 1/2/4/8 byte moves */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,.Lvmx_u_align_half
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

.Lvmx_u_align_half:
	bf	cr7*4+2,.Lvmx_u_align_word
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

.Lvmx_u_align_word:
	bf	cr7*4+1,.Lvmx_u_align_dword
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

.Lvmx_u_align_dword:
	bf	cr7*4+0,.Lvmx_u_aligned16
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

.Lvmx_u_aligned16:
	sub	r5,r5,r6

	/*
	 * Bring the destination up to 128B alignment.  cr7 gets
	 * (-r3 >> 4), selecting 16B/32B/64B vector moves; r6 keeps the
	 * byte total.
	 */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	/* Index registers for the lvx/stvx offsets */
	li	r9,16
	li	r10,32
	li	r11,48

	/*
	 * Prime the permute pipeline: v16 is the permute control vector
	 * and v0 the first source quadword.  From here on r4 runs 16B
	 * ahead, and every step leaves the newest quadword in v0 for the
	 * next VPERM.
	 */
	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,.Lvmx_u_pre32
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

.Lvmx_u_pre32:
	bf	cr7*4+2,.Lvmx_u_pre64
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

.Lvmx_u_pre64:
	bf	cr7*4+1,.Lvmx_u_aligned128
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

.Lvmx_u_aligned128:
	sub	r5,r5,r6
	srdi	r6,r5,7

	/* r14-r16 are needed as extra index registers; save them first */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	/* CTR = number of whole 128B blocks */
	mtctr	r6

	/*
	 * Cacheline sized loads and stores.  By this stage the cacheline
	 * stores are also cacheline aligned.
	 */
	.align	5
.Lvmx_u_loop128:
	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	stvx	v12,r3,r12
	stvx	v13,r3,r14
	stvx	v14,r3,r15
	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	.Lvmx_u_loop128

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* At most 127B left: 64B, 32B, then 16B vector pieces */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,.Lvmx_u_tail32
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

.Lvmx_u_tail32:
	bf	cr7*4+2,.Lvmx_u_tail16
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

.Lvmx_u_tail16:
	bf	cr7*4+3,.Lvmx_u_tail15
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16

	/* At most 15B left: finish with 8/4/2/1 byte integer moves */
.Lvmx_u_tail15:
	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,.Lvmx_u_tail_word
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

.Lvmx_u_tail_word:
	bf	cr7*4+1,.Lvmx_u_tail_half
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

.Lvmx_u_tail_half:
	bf	cr7*4+2,.Lvmx_u_tail_byte
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

.Lvmx_u_tail_byte:
	bf	cr7*4+3,.Lvmx_u_done
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/* Pop the frame, reload the saved r3 and leave through exit_vmx_ops */
.Lvmx_u_done:
	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_ops		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */