memcpy-sh4.S (15652B)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * "memcpy" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2002  STMicroelectronics Ltd
 *   Modified from memcpy.S and micro-optimised for SH4
 *   Stuart Menefy (stuart.menefy@st.com)
 *
 */
#include <linux/linkage.h>

/*
 * void *memcpy(void *dst, const void *src, size_t n);
 *
 * It is assumed that there is no overlap between src and dst.
 * If there is an overlap, then the results are undefined.
 *
 * Register roles (as set up at ENTRY(memcpy) below):
 *   r4        dst on entry; never modified, used as the "stop" address
 *   r5        src on entry; turned into (src - dst), plus a per-path bias,
 *             so that @(r0,r5) addresses the source data matching r0
 *   r6        n on entry; reused as a counter or scratch afterwards
 *   r0        dst + n; every store is a pre-decrement store through r0, so
 *             the copy runs from the end of the buffers back to the start
 *   r1-r3, r7 scratch
 *   r8-r11    (and r12 in .Lcase2b) are pushed on the stack around the
 *             32 byte cache line loops and popped again afterwards
 *
 * Dispatch:
 *   n == 0                            -> 99:  (returns r4 in r0)
 *   dst, src and n all long aligned   -> .Lcase00 (.Lcase00b if n >= 64)
 *   otherwise, n < 16                 -> byte loop, two bytes per iteration
 *   otherwise dst is first long word aligned one byte at a time, then the
 *   low two bits of (src - dst) select the bulk routine:
 *     00 -> .Lcase0 (n < 64) or .Lcase0b (n >= 64)
 *     10 -> .Lcase2 (n < 64) or .Lcase2b (n >= 64)
 *     01 -> .Lcase1
 *     11 -> .Lcase3
 *
 * Layout convention: an instruction indented by one extra space sits in the
 * delay slot of the branch (bt/s, bf/s, bra, rts) immediately before it.
 * The trailing "!  NN XX" annotations are the original author's per
 * instruction scheduling notes.
 */

	!
	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase1:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-1,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll16	r3		! 103 EX

	mov	r1,r6		!   5 MT (latency=0)
	shll8	r3		! 102 EX		! Oxxx

	shlr8	r6		! 106 EX		! xNML
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! ONML
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
	mov	r7,r3		!   5 MT (latency=0)	! OPQR

	cmp/hi	r2,r0		!  57 MT
	shlr16	r3		! 107 EX

	shlr8	r3		! 106 EX		! xxxO
	mov	r1,r6		!   5 MT (latency=0)

	shll8	r6		! 102 EX		! LMNx
	mov	r1,r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! LMNO
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#endif
	! Finally, copy a byte at once, if necessary

	add	#4,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop


	!
	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase3:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-3,r5		! 79 EX
	mov	r4,r2		!  5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
	add	#-4,r5		! 50 EX

	add	#7,r2		!  79 EX
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll8	r3		! 102 EX		! QPOx

	mov	r1,r6		!   5 MT (latency=0)
	shlr16	r6		! 107 EX

	shlr8	r6		! 106 EX		! xxxN
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! QPON
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov	r7,r3		! OPQR
	shlr8	r3		! xOPQ
	mov.l	@(r0,r5),r7	! KLMN
	mov	r7,r6
	shll16	r6
	shll8	r6		! Nxxx
	or	r6,r3		! NOPQ
	cmp/hi	r2,r0
	bt/s	3b
	 mov.l	r3,@-r0
#endif

	! Finally, copy a byte at once, if necessary

	add	#6,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop

ENTRY(memcpy)

	! Calculate the invariants which will be used in the remainder
	! of the code:
	!
	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
	!	         [ ...  ]                 [ ...  ]
	!	           :                        :
	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
	!
	!

	! Short circuit the common case of src, dst and len being 32 bit aligned
	! and test for zero length move

	mov	r6, r0		!   5 MT (0 cycle latency)
	or	r4, r0		!  82 EX

	or	r5, r0		!  82 EX
	tst	r6, r6		!  86 MT

	bt/s	99f		! 111 BR		(zero len)
	 tst	#3, r0		!  87 MT

	mov	r4, r0		!   5 MT (0 cycle latency)
	add	r6, r0		!  49 EX

	mov	#16, r1		!   6 EX
	bt/s	.Lcase00	! 111 BR		(aligned)

	 sub	r4, r5		!  75 EX

	! Arguments are not nicely long word aligned or zero len.
	! Check for small copies, and if so do a simple byte at a time copy.
	!
	! Deciding on an exact value of 'small' is not easy, as the point at which
	! using the optimised routines becomes worthwhile varies (these are the
	! cycle counts for different sizes using byte-at-a-time vs. optimised):
	!	size	byte-at-time	long	word	byte
	!	16	42		39-40	46-50	50-55
	!	24	58		43-44	54-58	62-67
	!	36	82		49-50	66-70	80-85
	! However the penalty for getting it 'wrong' is much higher for long word
	! aligned data (and this is more common), so use a value of 16.

	cmp/gt	r6,r1		!  56 MT

	add	#-1,r5		!  50 EX
	bf/s	6f		! 108 BR		(not small)

	 mov	r5, r3		!   5 MT (latency=0)
	shlr	r6		! 104 EX

	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
	bf/s	4f		! 111 BR

	 add	#-1,r3		!  50 EX
	tst	r6, r6		!  86 MT

	bt/s	98f		! 110 BR
	 mov.b	r1,@-r0		!  29 LS

	! 4 cycles, 2 bytes per iteration
3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
	dt	r6		!  67 EX

	mov.b	r1,@-r0		!  29 LS
	bf/s	3b		! 111 BR

	 mov.b	r2,@-r0		!  29 LS
98:
	rts
	 nop

	! Zero length: nothing was copied, hand dst back in r0
99:	rts
	 mov	r4, r0

	! Size is not small, so it is worthwhile looking for optimisations.
	! First align destination to a long word boundary.
	!
	! r5 = normal value -1

6:	tst	#3, r0		!  87 MT
	mov	#3, r3		!   6 EX

	bt/s	2f		! 111 BR
	 and	r0,r3		!  78 EX

	! 3 cycles, 1 byte per iteration
1:	dt	r3		!  67 EX
	mov.b	@(r0,r5),r1	!  19 LS (latency=2)

	add	#-1, r6		!  79 EX
	bf/s	1b		! 109 BR

	 mov.b	r1,@-r0		!  28 LS

2:	add	#1, r5		!  79 EX

	! Now select the appropriate bulk transfer code based on relative
	! alignment of src and dst.
	!
	! r0 is needed for the tst #imm forms, so the destination pointer is
	! parked in r3 while the low bits of r5 are examined, and moved back
	! into r0 in the delay slot of each dispatch branch.

	mov	r0, r3		!   5 MT (latency=0)

	mov	r5, r0		!   5 MT (latency=0)
	tst	#1, r0		!  87 MT

	bf/s	1f		! 111 BR
	 mov	#64, r7		!   6 EX

	! bit 0 clear

	cmp/ge	r7, r6		!  55 MT

	bt/s	2f		! 111 BR
	 tst	#2, r0		!  87 MT

	! small
	bt/s	.Lcase0
	 mov	r3, r0

	bra	.Lcase2
	 nop

	! big
2:	bt/s	.Lcase0b
	 mov	r3, r0

	bra	.Lcase2b
	 nop

	! bit 0 set
1:	tst	#2, r0		! 87 MT

	bt/s	.Lcase1
	 mov	r3, r0

	bra	.Lcase3
	 nop


	!
	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
	!

	! src, dst and size are all long word aligned
	! size is non-zero

	.balign	32
.Lcase00:
	mov	#64, r1		!   6 EX
	mov	r5, r3		!   5 MT (latency=0)

	cmp/gt	r6, r1		!  56 MT
	add	#-4, r5		!  50 EX

	bf	.Lcase00b	! 108 BR		(big loop)
	shlr2	r6		! 105 EX

	shlr	r6		! 104 EX
	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

	bf/s	4f		! 111 BR
	 add	#-8, r3		!  50 EX

	tst	r6, r6		!  86 MT
	bt/s	5f		! 110 BR

	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	dt	r6		!  67 EX

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

5:	rts
	 nop


	! Size is 16 or greater and less than 64, but may have trailing bytes

	.balign	32
.Lcase0:
	add	#-4, r5		!  50 EX
	mov	r4, r7		!   5 MT (latency=0)

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	mov	#4, r2		!   6 EX

	add	#11, r7		!  50 EX
	tst	r2, r6		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX
	mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r1, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

	add	#3,r5		!  50 EX

	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	.balign	32
.Lcase0b:
	add	#-4, r5		!  50 EX

.Lcase00b:
	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX

	! copy initial words until cache line aligned

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	tst	#4, r0		!  87 MT

	mov	r5, r6		!   5 MT (latency=0)
	add	#-4, r6		!  50 EX

	bt/s	4f		! 111 BR
	 add	#8, r3		!  50 EX

	tst	#0x18, r0	!  87 MT

	bt/s	1f		! 109 BR
	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
	cmp/eq	r3, r0		!  54 MT

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r7, @-r0	!  30 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1c, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	! 16 cycles, 32 bytes per iteration
2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
	add	#-0x20, r1	! 50 EX
	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
	movca.l	r0,@r1		! 40 LS (latency=3-7)
	mov.l	r3,@(0x04,r1)	! 33 LS
	mov.l	r6,@(0x08,r1)	! 33 LS
	mov.l	r7,@(0x0c,r1)	! 33 LS

	mov.l	r8,@(0x10,r1)	! 33 LS
	add	#-0x20, r5	! 50 EX

	mov.l	r9,@(0x14,r1)	! 33 LS
	cmp/eq	r2,r1		! 54 MT

	mov.l	r10,@(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11,@(0x1c,r1)	!  33 LS

	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	 mov.l	@r15+, r9	!  15 LS

	! The pop of r8 at 1: is both the delay slot of this rts and the
	! target of the bf/s above, so r8 is restored on either path.
	rts
1:	 mov.l	@r15+, r8	!  15 LS
	sub	r4, r1		!  75 EX		(len remaining)

	! number of trailing bytes is non-zero
	!
	! invariants restored (r5 already decremented by 4)
	! also r1=num bytes remaining

	mov	#4, r2		!   6 EX
	mov	r4, r7		!   5 MT (latency=0)

	add	#0x1c, r5	!  50 EX		(back to -4)
	cmp/hs	r2, r1		!  58 MT

	bf/s	5f		! 108 BR
	 add	#11, r7		!  50 EX

	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
	tst	r2, r1		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX
	cmp/hs	r2, r1		!  58 MT

	bt/s	5f		! 111 BR
	 mov.l	r6,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r6, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

5:	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR
	add	#3,r5		!  50 EX

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

	!
	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
	!

	.balign	32
.Lcase2:
	! Size is 16 or greater and less than 64, but may have trailing bytes

2:	mov	r5, r6		!   5 MT (latency=0)
	add	#-2,r5		!  50 EX

	mov	r4,r2		!   5 MT (latency=0)
	add	#-4,r6		!  50 EX

	add	#7,r2		!  50 EX
3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)

	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
	cmp/hi	r2,r0		!  57 MT

	mov.w	r1,@-r0		!  29 LS
	bt/s	3b		! 111 BR

	 mov.w	r3,@-r0		!  29 LS

	bra	10f
	 nop


	.balign	32
.Lcase2b:
	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	add	#-2, r5		!  50 EX
	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX

	! Copy a short word one at a time until we are cache line aligned
	!   Normal values: r0, r2, r3, r4
	!   Unused: r1, r6, r7
	!   Mod: r5 (=r5-2)
	!
	add	#2, r3		!  50 EX

2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
	cmp/eq	r3,r0		!  54 MT

	bf/s	2b		! 111 BR

	 mov.w	r1,@-r0		!  29 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5 (=r5-2)
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	mov.l	r12, @-r15	!  30 LS

	! 17 cycles, 32 bytes per iteration
#ifdef CONFIG_CPU_LITTLE_ENDIAN
2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
	add	#-0x20, r1	!  50 EX

	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK

	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
	shll16	r0		! 103 EX			JI..

	mov.l	@r5+, r7	!  15 LS (latency=2)
	xtrct	r3, r0		!  48 EX			LKJI

	mov.l	@r5+, r8	!  15 LS (latency=2)
	xtrct	r6, r3		!  48 EX			PONM

	mov.l	@r5+, r9	!  15 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@r5+, r10	!  15 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@r5+, r11	!  15 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.w	@r5+, r12	!  15 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	xtrct	r11, r10	!  48 EX

	mov.l	r3, @(0x04,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r6, @(0x08,r1)	!  33 LS

	mov.l	r7, @(0x0c,r1)	!  33 LS

	mov.l	r8, @(0x10,r1)	!  33 LS
	add	#-0x40, r5	!  50 EX

	mov.l	r9, @(0x14,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x1c,r1)	!  33 LS
#else
2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
	add	#-2, r5		!  50 EX

	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
	add	#-4, r1		!  50 EX

	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
	shll16	r0		! 103 EX

	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
	xtrct	r3, r0		!  48 EX

	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
	xtrct	r6, r3		!  48 EX

	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.l	@(0x00,r5), r12	!  18 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	add	#-0x1c, r1	!  50 EX

	mov.l	r3, @(0x18,r1)	!  33 LS
	xtrct	r11, r10	!  48 EX

	mov.l	r6, @(0x14,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r7, @(0x10,r1)	!  33 LS

	mov.l	r8, @(0x0c,r1)	!  33 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r9, @(0x08,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x04,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x00,r1)	!  33 LS
#endif

	mov.l	@r15+, r12
	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	 mov.l	@r15+, r9	!  15 LS

	! The pop of r8 at 1: is both the delay slot of this rts and the
	! target of the bf/s above, so r8 is restored on either path.
	rts
1:	 mov.l	@r15+, r8	!  15 LS

	add	#0x1e, r5	!  50 EX

	! Finish off a short word at a time
	! r5 must be invariant - 2
10:	mov	r4,r2		!   5 MT (latency=0)
	add	#1,r2		!  50 EX

	cmp/hi	r2, r0		!  57 MT
	bf/s	1f		! 109 BR

	 add	#2, r2		!  50 EX

3:	mov.w	@(r0,r5),r1	!  20 LS
	cmp/hi	r2,r0		!  57 MT

	bt/s	3b		! 109 BR

	 mov.w	r1,@-r0		!  29 LS
1:

	!
	! Finally, copy the last byte if necessary
	! (9b is the rts/nop pair that ends the .Lcase0b tail above)
	cmp/eq	r4,r0		!  54 MT
	bt/s	9b
	 add	#1,r5
	mov.b	@(r0,r5),r1
	rts
	 mov.b	r1,@-r0