/* memcmp_64.S (11692B) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

/* Scratch offset registers used as index operands for LD/lvx. */
#define off8	r6
#define off16	r7
#define off24	r8

/* Comparison data registers. rA..rC are volatile; rD..rH are
 * non-volatile GPRs and are saved/restored around the .Llong loop. */
#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

/*
 * Endian-neutral load/permute wrappers: on little endian the
 * byte-reversed loads (l?brx) plus lvsr give the same big-endian
 * ordered values that the plain loads plus lvsl give on big endian,
 * so the unsigned doubleword compares below order bytes correctly.
 */
#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

/* Only use the VMX path for compares of at least this many bytes. */
#define VMX_THRESH	4096

/*
 * Save r3/r4/r5 in the stack-parameter save area, call
 * enter_vmx_ops, then restore them.  Leaves cr1 set from
 * "cmpwi cr1,r3,0" on enter_vmx_ops' return value: cr1.eq means VMX
 * could not be used and the caller must branch to a non-VMX path.
 */
#define ENTER_VMX_OPS	\
	mflr	r0;	\
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	enter_vmx_ops; \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/* Mirror of ENTER_VMX_OPS for exit_vmx_ops; does not test the result. */
#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	exit_vmx_ops; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
 * 16 bytes boundary and permute the result with the 1st 16 bytes.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbb30
 *                                 ^
 *                               _vaddr
 *
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * In:	r3 = s1, r4 = s2, r5 = n (bytes to compare)
 * Out:	r3 = 0 if equal, 1 if the first differing byte/word is greater
 *	in s1, -1 if it is greater in s2.
 *
 * There are 2 categories for memcmp:
 * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
 * are named like .Lsameoffset_xxxx
 * 2) src/dst has different offset to the 8 bytes boundary. The handlers
 * are named like .Ldiffoffset_xxxx
 */
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Use the short loop if the src/dst addresses are not
	 * with the same offset of 8 bytes align boundary.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to short loop if compare at aligned addrs
	 * with less than 8 bytes.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	/* Byte-at-a-time compare, unrolled x4; CTR counts bytes left. */
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	/* Prefetch both source lines before the heavier loops. */
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* attempt to compare bytes not aligned with 8 bytes so that
	 * rest comparison can run based on 8 bytes alignment.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word which is not 8 bytes aligned:
	 * load the first double word at (src & ~7UL) and shift left appropriate
	 * bits before comparision.
	 * r6 = (r3 & 7) * 8 = number of leading bits to discard.
	 */
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3			/* back to a byte count */
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8			/* bytes consumed this doubleword */
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* now we are aligned with 8 bytes.
	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi	r5,r5,61		/* r5 = remaining bytes mod 8 */
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)		// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3			/* trailing bits to discard */
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	/* rC already holds the signed byte difference from .Lshort. */
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use vmx loop if length is equal or greater than 4K */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least s1 addr is aligned with 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	/* Save non-volatile GPRs used as rD..rH (redzone below r1). */
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5			/* CTR = 32-byte iterations */
	mtctr	r0
	andi.	r5,r5,31		/* r5 = tail bytes for .Lshort */

	/* Software-pipelined loop: loads for the next 32 bytes overlap
	 * the compares of the previous 32, one CR field per pair. */
	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	/* Drain the pipeline: compare the last two loaded groups. */
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	/* Only one 32-byte group was loaded; compare it and finish. */
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

/* Difference found: translate the recorded CR field into +/-1,
 * then restore the non-volatile GPRs at .Lout. */
.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter with src/dst addrs has the same offset with 8 bytes
	 * align boundary.
	 *
	 * There is an optimization based on following fact: memcmp()
	 * prones to fail early at the first 32 bytes.
	 * Before applying VMX instructions which will lead to 32x128bits
	 * VMX regs load/restore penalty, we compare the first 32 bytes
	 * so that we can catch the ~80% fail cases.
	 */

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp	/* VMX unavailable: scalar loop */

3:
	/* need to check whether r4 has the same offset with r3
	 * for 16 bytes boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is no less than 4KB. Need to align with 16 bytes further.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 across EXIT_VMX_OPS */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes for each loop */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59		/* r5 = tail bytes (< 32) */
	li	off16,16

.balign 16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f			/* not all-equal: diff in 1st QW */
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f			/* not all-equal: diff in 2nd QW */
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* diff the last 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 with 8 bytes */
	rlwinm	r6,r3,3,26,28		/* r6 = (r3 & 7) * 8 */
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4			/* unaligned load */
	sld	rA,rA,r6		/* drop bytes before s1 ... */
	srd	rA,rA,r6		/* ... then realign for compare */
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned with 8 bytes */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size equal or greater than 4K bytes */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32 bytes pre-checking before
	 * enable VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp	/* VMX unavailable */

.Ldiffoffset_vmx_cmp_start:
	/* Firstly try to align r3 with 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	/* Build permute masks, then compare one cross-boundary QW. */
	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16		/* advance to the 16B boundary */
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned with 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5			/* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6

.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8		/* carry 2nd QW over as next 1st */
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* anyway, the diff will appear in next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)