copyuser_power7.S (12150B)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

/*
 * LVS builds the permute control vector and VPERM merges two adjacent
 * source quadwords with it in the unaligned VMX copy below. The little
 * endian variants use lvsr and swap the two vperm data operands.
 */
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

/*
 * errN: written as "errN; <insn>" in front of a load or store. Each macro
 * drops a numeric local label on the instruction that follows it and emits
 * an EX_TABLE entry pairing that instruction with the handler .Ldo_errN.
 * Which macro is used depends on what must be undone at that point:
 *
 *   err1 - no stack frame is live
 *   err2 - non-VMX frame is live, r14-r22 are saved in it
 *   err3 - VMX frame is live
 *   err4 - VMX frame is live and r14-r16 are saved in it
 *
 * NOTE(review): EX_TABLE is defined outside this file; presumably it records
 * a fixup so that a fault on the tagged access resumes at the handler.
 */
	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm


/*
 * Handlers for the VMX paths. .Ldo_err4 reloads r14-r16 from the frame and
 * falls through to .Ldo_err3, which calls exit_vmx_usercopy, reloads the
 * link register from the slot it was saved in, and joins .Lexit to pop the
 * frame.
 */
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

/*
 * Handlers for the non-VMX path. .Ldo_err2 reloads r14-r22, .Lexit pops the
 * frame, and .Ldo_err1 reloads the r3/r4/r5 values stored at function entry
 * and branches to __copy_tofrom_user_base with them. The handlers fall
 * through in that order, so .Ldo_err1 always runs with r1 at its entry value.
 */
.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


/*
 * __copy_tofrom_user_power7
 *
 * In:	r3 = destination, r4 = source, r5 = length in bytes (the roles the
 *	alignment comments and length arithmetic below give these registers).
 * Out:	r3 = 0 when the non-VMX path runs to completion. The VMX paths end
 *	by tail-calling exit_vmx_usercopy, so r3 is whatever that routine
 *	leaves in it (NOTE(review): confirm it returns 0).
 *
 * Every user access is tagged with err1..err4; the handlers above reload
 * the original arguments and branch to __copy_tofrom_user_base.
 *
 * Dispatch: fewer than 16 bytes go straight to .Lshort_copy; more than 3328
 * bytes take .Lvmx_copy when CONFIG_ALTIVEC is set and the CPU_FTR_ALTIVEC
 * feature section is enabled; everything else uses .Lnonvmx_copy.
 */
_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,3328

	/*
	 * Stash the arguments below the stack pointer. Once a frame has been
	 * pushed with stdu r1,-STACKFRAMESIZE(r1) the same slots are reachable
	 * as STK_REG(R31/R30/R29)(r1).
	 */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	/*
	 * r6 = -source; mtocrf 0x01 puts its low four bits in cr7 so each bf
	 * below tests one bit of the distance to the next 8B boundary. r6 is
	 * then masked to that distance (0-7) for the length update at 3:.
	 */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

	/* Drop the alignment bytes from the count; skip the loop if < 128B */
3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	/*
	 * The 128B loop needs r14-r21 as extra data registers, so push a
	 * frame and save r14-r22 plus the link register. Accesses inside the
	 * loop use err2 so the frame is undone on a fault.
	 */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	/* CTR = number of whole 128B blocks */
	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	/* r5 = bytes left after the whole 128B blocks */
	clrldi	r5,r5,(64-7)

	/* Restore the saved registers and pop the frame; err1 applies again */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
	/* cr7 = bits 4-6 of the count: 64B, 32B and 16B chunks still to copy */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
	/*
	 * Also the direct entry for copies shorter than 16 bytes. cr7 = low
	 * four bits of the count, selecting the 8, 4, 2 and 1 byte steps.
	 */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

	/* Copy complete: return 0 */
15:	li	r3,0
	blr

/*
 * Reached from .Lvmx_copy when enter_vmx_usercopy returned 0: pop the VMX
 * frame and do the copy with the non-VMX code instead. r3/r4/r5 and the link
 * register have already been reloaded by then.
 */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	/*
	 * Save the link register, push a frame and call enter_vmx_usercopy.
	 * Its result is captured in cr1 before r3 is overwritten; the
	 * arguments are then reloaded from the slots written at entry.
	 */
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	/* setup read stream 0 */
	dcbt	0,r6,0b01000	/* addr from */
	dcbt	0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst	0,r9,0b01000	/* addr to */
	dcbtst	0,r10,0b01010	/* length and depth to */
	eieio
	dcbt	0,r8,0b01010	/* all streams GO */

	/* enter_vmx_usercopy returned 0: fall back to the non-VMX copy */
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	/* r6 = low four bits of (source ^ destination); non-zero means unaligned */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	/* cr7 = low four bits of -destination; r6 = that distance (0-15) */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	/*
	 * cr7 = bits 4-6 of -destination (16B, 32B and 64B steps still needed);
	 * r6 = distance to the 128B boundary, subtracted from the count at 7:.
	 */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	/* Index registers for the lvx/stvx offsets */
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

	/* r5 = bytes left, r6 = number of whole 128B blocks */
7:	sub	r5,r5,r6
	srdi	r6,r5,7

	/*
	 * r14-r16 are needed as additional index registers, so save them in
	 * the frame first. Accesses inside the loop use err4 so they are
	 * restored on a fault.
	 */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

	/* Pop the frame; exit_vmx_usercopy returns directly to our caller */
15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */

/*
 * Source and destination differ in their low four address bits. The
 * destination is aligned exactly as in the aligned path; each 16B store is
 * then built by VPERM from two consecutive source quadwords.
 */
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	/*
	 * Prime the pipeline: v0 holds the first source quadword and r4 is
	 * advanced past it. From here on v0 is always the most recently
	 * loaded quadword and r4 stays 16 bytes ahead until label 11.
	 */
	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1	/* v0 = v1: keep the last loaded quadword in v0 */

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

	/* r5 = bytes left, r6 = number of whole 128B blocks */
7:	sub	r5,r5,r6
	srdi	r6,r5,7

	/* Save r14-r16 before using them as index registers (err4 region) */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

	/* Pop the frame; exit_vmx_usercopy returns directly to our caller */
15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */