NGmemcpy.S
/* SPDX-License-Identifier: GPL-2.0 */
/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE	%g7
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_AIUS, %asi
#else
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

#ifdef __sparc_v9__
#define SAVE_AMOUNT	128
#else
#define SAVE_AMOUNT	64
#endif

#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
#endif
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
__restore_asi:
	ret
	wr	%g0, ASI_AIUS, %asi
	restore
ENTRY(NG_ret_i2_plus_i4_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i5, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_1)
ENTRY(NG_ret_i2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1)
ENTRY(NG_ret_i2_plus_g1_minus_8)
	sub	%g1, 8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_8)
ENTRY(NG_ret_i2_plus_g1_minus_16)
	sub	%g1, 16, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_16)
ENTRY(NG_ret_i2_plus_g1_minus_24)
	sub	%g1, 24, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_24)
ENTRY(NG_ret_i2_plus_g1_minus_32)
	sub	%g1, 32, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_32)
ENTRY(NG_ret_i2_plus_g1_minus_40)
	sub	%g1, 40, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_40)
ENTRY(NG_ret_i2_plus_g1_minus_48)
	sub	%g1, 48, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_48)
ENTRY(NG_ret_i2_plus_g1_minus_56)
	sub	%g1, 56, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_56)
ENTRY(NG_ret_i2_plus_i4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4)
ENTRY(NG_ret_i2_plus_i4_minus_8)
	sub	%i4, 8, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_minus_8)
ENTRY(NG_ret_i2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 8, %i0
ENDPROC(NG_ret_i2_plus_8)
ENTRY(NG_ret_i2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 4, %i0
ENDPROC(NG_ret_i2_plus_4)
ENTRY(NG_ret_i2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 1, %i0
ENDPROC(NG_ret_i2_plus_1)
ENTRY(NG_ret_i2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_plus_1)
ENTRY(NG_ret_i2)
	ba,pt	%xcc, __restore_asi
	 mov	%i2, %i0
ENDPROC(NG_ret_i2)
ENTRY(NG_ret_i2_and_7_plus_i4)
	and	%i2, 7, %i2
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_and_7_plus_i4)
#endif
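
/* The NG_ret_* entry points above are fault fixup targets for the
 * EX_LD/EX_ST annotations used throughout the copy loops below.  Each
 * stub reconstructs how many bytes were still uncopied when the
 * faulting load or store was reached (e.g. NG_ret_i2_plus_g1_minus_8
 * returns %i2 + (%g1 - 8)), restores %asi via __restore_asi, and hands
 * that count back in %i0 so callers such as the user-copy wrappers
 * that include this file can report the number of bytes not copied.
 */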

	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %i0=dst, %i1=src, %i2=len */
	PREAMBLE
	save		%sp, -SAVE_AMOUNT, %sp
	srlx		%i2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	mov		%i0, %o0
	cmp		%i2, 0
	be,pn		%XCC, 85f
	 or		%o0, %i1, %i3
	cmp		%i2, 16
	blu,a,pn	%XCC, 80f
	 or		%i3, %i2, %i3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
	cmp		%i2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%i3, 0x7, %g0

	/* %o0:	dst
	 * %i1:	src
	 * %i2:	len  (known to be >= 128)
	 *
	 * The block copy loops will use %i4/%i5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %i1, #one_read)
	wr		%g0, STORE_ASI, %asi

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %i4
	be,pt		%XCC, 2f
	 sub		%i4, 64, %i4
	sub		%g0, %i4, %i4	! bytes to align dst
	sub		%i2, %i4, %i2
1:	subcc		%i4, 1, %i4
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
	EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
	add		%i1, 1, %i1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop.  If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line we have to store the whole
	 * cacheline else we can end up with corrupt L2 cache line
	 * contents.  Since the loop works on 64-bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
2:
	andcc		%i1, (16 - 1), %i4
	andn		%i2, (64 - 1), %g1	! block copy loop iterator
	be,pt		%XCC, 50f
	 sub		%i2, %g1, %i2		! final sub-block copy bytes

	cmp		%i4, 8
	be,pt		%XCC, 10f
	 sub		%i1, %i4, %i1

	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
	and		%i4, 0x7, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE
	mov		64, %i5
	EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
	sub		%i5, GLOBAL_SPARE, %i5
	mov		16, %o4
	mov		32, %o5
	mov		48, %o7
	mov		64, %i3

	bg,pn		%XCC, 9f
	 nop

#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
	sllx		WORD1, POST_SHIFT, WORD1; \
	srlx		WORD2, PRE_SHIFT, TMP; \
	sllx		WORD2, POST_SHIFT, WORD2; \
	or		WORD1, TMP, WORD1; \
	srlx		WORD3, PRE_SHIFT, TMP; \
	or		WORD2, TMP, WORD2;
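
/* Worked example of the mix above: with the source misaligned by 2
 * bytes, POST_SHIFT = 2 * 8 = 16 and PRE_SHIFT = 64 - 16 = 48, so on
 * this big-endian machine the macro computes
 *
 *	WORD1 = (WORD1 << 16) | (WORD2 >> 48)
 *	WORD2 = (WORD2 << 16) | (WORD3 >> 48)
 *
 * i.e. each output doubleword drops the bytes already consumed from
 * one input word and pulls in the leading bytes of the next, which is
 * the "integer faligndata" mentioned in the comment before 2: above.
 */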

8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 8b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 9b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1
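
/* The loops at 10: and 50: below use the same software pipelining as
 * 8: and 9: above: one twin load is issued ahead of the stores that
 * consume it, so every iteration has its data on hand and writes the
 * full 64-byte line, as the init-store rule described before 2:
 * requires.
 */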

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned but it has been subtracted by 8 and we perform
	 * one twin load ahead, then add 8 back into source when
	 * we finish the loop.
	 */
	EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
	add	%i1, 64, %i1
	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	 add	%o0, 64, %o0

	ba,pt	%XCC, 60f
	 add	%i1, 0x8, %i1

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	add	%i1, 64, %i1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	 add	%o0, 64, %o0
	/* fall through */

60:
	membar		#Sync

	/* %i2 contains any final bytes still needed to be copied
	 * over.  If anything is left, we copy it one byte at a time.
	 */
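
/* The membar #Sync above orders the block-init stores ahead of
 * everything that follows, and RESTORE_ASI switches %asi back to its
 * expected value (ASI_AIUS in the kernel, ASI_PNF otherwise, per the
 * macros at the top of this file) before the tail copy runs.
 */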
392 */ 393 RESTORE_ASI(%i3) 394 brz,pt %i2, 85f 395 sub %o0, %i1, %i3 396 ba,a,pt %XCC, 90f 397 nop 398 399 .align 64 40070: /* 16 < len <= 64 */ 401 bne,pn %XCC, 75f 402 sub %o0, %i1, %i3 403 40472: 405 andn %i2, 0xf, %i4 406 and %i2, 0xf, %i2 4071: subcc %i4, 0x10, %i4 408 EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4) 409 add %i1, 0x08, %i1 410 EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4) 411 sub %i1, 0x08, %i1 412 EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4) 413 add %i1, 0x8, %i1 414 EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8) 415 bgu,pt %XCC, 1b 416 add %i1, 0x8, %i1 41773: andcc %i2, 0x8, %g0 418 be,pt %XCC, 1f 419 nop 420 sub %i2, 0x8, %i2 421 EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8) 422 EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8) 423 add %i1, 0x8, %i1 4241: andcc %i2, 0x4, %g0 425 be,pt %XCC, 1f 426 nop 427 sub %i2, 0x4, %i2 428 EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4) 429 EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4) 430 add %i1, 0x4, %i1 4311: cmp %i2, 0 432 be,pt %XCC, 85f 433 nop 434 ba,pt %xcc, 90f 435 nop 436 43775: 438 andcc %o0, 0x7, %g1 439 sub %g1, 0x8, %g1 440 be,pn %icc, 2f 441 sub %g0, %g1, %g1 442 sub %i2, %g1, %i2 443 4441: subcc %g1, 1, %g1 445 EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1) 446 EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1) 447 bgu,pt %icc, 1b 448 add %i1, 1, %i1 449 4502: add %i1, %i3, %o0 451 andcc %i1, 0x7, %g1 452 bne,pt %icc, 8f 453 sll %g1, 3, %g1 454 455 cmp %i2, 16 456 bgeu,pt %icc, 72b 457 nop 458 ba,a,pt %xcc, 73b 459 4608: mov 64, %i3 461 andn %i1, 0x7, %i1 462 EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2) 463 sub %i3, %g1, %i3 464 andn %i2, 0x7, %i4 465 sllx %g2, %g1, %g2 4661: add %i1, 0x8, %i1 467 EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4) 468 subcc %i4, 0x8, %i4 469 srlx %g3, %i3, %i5 470 or %i5, %g2, %i5 471 EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4) 472 add %o0, 0x8, %o0 473 bgu,pt %icc, 1b 474 sllx %g3, %g1, %g2 475 476 srl %g1, 3, %g1 477 andcc %i2, 0x7, %i2 478 be,pn %icc, 85f 479 add %i1, %g1, %i1 480 ba,pt %xcc, 90f 481 sub %o0, %i1, %i3 482 483 .align 64 48480: /* 0 < len <= 16 */ 485 andcc %i3, 0x3, %g0 486 bne,pn %XCC, 90f 487 sub %o0, %i1, %i3 488 4891: 490 subcc %i2, 4, %i2 491 EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4) 492 EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4) 493 bgu,pt %XCC, 1b 494 add %i1, 4, %i1 495 49685: ret 497 restore EX_RETVAL(%i0), %g0, %o0 498 499 .align 32 50090: 501 subcc %i2, 1, %i2 502 EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1) 503 EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1) 504 bgu,pt %XCC, 90b 505 add %i1, 1, %i1 506 ret 507 restore EX_RETVAL(%i0), %g0, %o0 508 509 .size FUNC_NAME, .-FUNC_NAME