fastcopy.S (20680B)
/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input :	Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 *
 * Registers written by this file (no stack is used):
 *	r3	return value (the destination address as passed in)
 *	r4	n   - byte count of the phase currently running
 *	r5	d   - destination cursor (modified)
 *	r6	s   - source cursor (modified)
 *	r7	c   - bytes still to copy (modified)
 *	r8	as  - source cursor rounded down to a word boundary
 *	r9	t1  - scratch / source misalignment (s & 3)
 *	r10	t2, or running byte offset in the ascending word loops
 *	r11	h   - bytes carried over from the previously loaded word
 *	r12	v   - most recently loaded source word
 *
 * Phases, both directions:
 *	1. byte copies until the destination is word aligned
 *	2. 32-byte blocks (aligned source, or shift/merge for s & 3 = 1, 2, 3)
 *	3. single words (same four variants)
 *	4. trailing bytes
 *
 * The shift/merge variants assemble each destination word as
 * (h << 8*k) | (v >> 8*(4-k)) when ascending and the mirror image when
 * descending.  That is only correct when the lowest-addressed byte of a
 * word is its most significant byte, i.e. on big-endian MicroBlaze.
 */

#include <linux/linkage.h>

/* Refuse to build for little-endian rather than corrupt unaligned copies. */
#ifdef __MICROBLAZEEL__
#error "fastcopy.S: unaligned shift/merge paths are big-endian only; use the C memcpy/memmove on little-endian"
#endif

	.text
	.globl	memcpy
	.type  memcpy, @function
	.ent	memcpy

/*
 * memcpy: r3 = r5; copy r7 bytes from r6 to r5 with ascending addresses.
 * Also entered from memmove at fast_memcpy_ascending when d <= s.
 *
 * cmpu rD, rA, rB yields rB - rA with the sign bit replaced by the unsigned
 * comparison (rA > rB), so "blti" after it means "rB < rA, unsigned".
 */
memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end		/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7		/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

a_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

a_block_aligned:
	lwi	r9, r6, 0		/* t1 = *(s + 0) */
	lwi	r10, r6, 4		/* t2 = *(s + 4) */
	lwi	r11, r6, 8		/* t3 = *(s + 8) */
	lwi	r12, r6, 12		/* t4 = *(s + 12) */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	swi	r10, r5, 4		/* *(d + 4) = t2 */
	swi	r11, r5, 8		/* *(d + 8) = t3 */
	swi	r12, r5, 12		/* *(d + 12) = t4 */
	lwi	r9, r6, 16		/* t1 = *(s + 16) */
	lwi	r10, r6, 20		/* t2 = *(s + 20) */
	lwi	r11, r6, 24		/* t3 = *(s + 24) */
	lwi	r12, r6, 28		/* t4 = *(s + 28) */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	swi	r10, r5, 20		/* *(d + 20) = t2 */
	swi	r11, r5, 24		/* *(d + 24) = t3 */
	swi	r12, r5, 28		/* *(d + 28) = t4 */
	addi	r6, r6, 32		/* s = s + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	/* s is advanced past the whole block run up front; loops walk as */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

a_block_u3:
	bslli	r11, r11, 24		/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 32		/* v = *(as + 32) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	addi	r8, r8, 32		/* as = as + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_bu3_loop		/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u1:
	bslli	r11, r11, 8		/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 32		/* v = *(as + 32) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	addi	r8, r8, 32		/* as = as + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_bu1_loop		/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u2:
	bslli	r11, r11, 16		/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 32		/* v = *(as + 32) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	addi	r8, r8, 32		/* as = as + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_bu2_loop		/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	/* falls through to a_block_done */

a_block_done:
	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end		/* if n < 0, less than one word to transfer */

a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s + offset) */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	addi	r4, r4, -4		/* n = n - 4 */
	bneid	r4, a_word_aligned	/* while (n) loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24		/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10		/* v = *(as + offset) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	addi	r4, r4, -4		/* n = n - 4 */
	bneid	r4, a_wu3_loop		/* while (n) loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8		/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10		/* v = *(as + offset) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	addi	r4, r4, -4		/* n = n - 4 */
	bneid	r4, a_wu1_loop		/* while (n) loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16		/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10		/* v = *(as + offset) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	addi	r4, r4, -4		/* n = n - 4 */
	bneid	r4, a_wu2_loop		/* while (n) loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */
	/* falls through to a_word_done */

a_word_done:
	/* the word loops indexed by offset; fold it into the cursors now */
	add	r5, r5, r10		/* d = d + offset */
	add	r6, r6, r10		/* s = s + offset */
	rsub	r7, r10, r7		/* c = c - offset */

a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done		/* while (c) */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r6, r6, 1		/* s++ */
	sbi	r9, r5, 0		/* *d = t1 */
	addi	r7, r7, -1		/* c-- */
	brid	a_xfer_end_loop		/* loop */
	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop

.size  memcpy, . - memcpy
.end memcpy
/*----------------------------------------------------------------------------*/
	.globl	memmove
	.type  memmove, @function
	.ent	memmove

/*
 * memmove: r3 = r5; copy r7 bytes from r6 to r5, safe for overlap.
 *
 * When d <= s (unsigned) an ascending copy cannot overwrite source bytes
 * that are still to be read, so control transfers to memcpy's body.
 * Otherwise both cursors are moved to the end of their buffers and the
 * copy runs with descending addresses.
 */
memmove:
	cmpu	r4, r5, r6		/* n = s - d, sign bit = (d > s) unsigned */
	bgei	r4, fast_memcpy_ascending

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7		/* d = d + c */
	add	r6, r6, r7		/* s = s + c */

	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end		/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, d_dalign_done
	rsub	r7, r4, r7		/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

d_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

d_block_aligned:
	addi	r6, r6, -32		/* s = s - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r9, r6, 28		/* t1 = *(s + 28) */
	lwi	r10, r6, 24		/* t2 = *(s + 24) */
	lwi	r11, r6, 20		/* t3 = *(s + 20) */
	lwi	r12, r6, 16		/* t4 = *(s + 16) */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	swi	r10, r5, 24		/* *(d + 24) = t2 */
	swi	r11, r5, 20		/* *(d + 20) = t3 */
	swi	r12, r5, 16		/* *(d + 16) = t4 */
	lwi	r9, r6, 12		/* t1 = *(s + 12) */
	lwi	r10, r6, 8		/* t2 = *(s + 8) */
	lwi	r11, r6, 4		/* t3 = *(s + 4) */
	lwi	r12, r6, 0		/* t4 = *(s + 0) */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	swi	r10, r5, 8		/* *(d + 8) = t2 */
	swi	r11, r5, 4		/* *(d + 4) = t3 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	/* s is moved below the whole block run up front; loops walk as */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_block_u2		/* t1 was 2 => 2 byte offset */

d_block_u3:
	bsrli	r11, r11, 8		/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32		/* as = as - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 0		/* v = *(as + 0) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_bu3_loop		/* while (n) loop */
	bsrli	r11, r12, 8		/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u1:
	bsrli	r11, r11, 24		/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32		/* as = as - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 0		/* v = *(as + 0) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_bu1_loop		/* while (n) loop */
	bsrli	r11, r12, 24		/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u2:
	bsrli	r11, r11, 16		/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32		/* as = as - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 0		/* v = *(as + 0) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_bu2_loop		/* while (n) loop */
	bsrli	r11, r12, 16		/* h = v >> 16 (IN DELAY SLOT) */
	/* falls through to d_block_done */

d_block_done:
	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end		/* if n < 0, less than one word to transfer */

d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	/* cursors drop to the bottom of the word run; loops index up by n */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4, -4		/* n = n - 4 */
	lw	r9, r6, r4		/* t1 = *(s + n) */
	bneid	r4, d_word_aligned	/* while (n) loop */
	sw	r9, r5, r4		/* *(d + n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9, d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8		/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4, -4		/* n = n - 4 */
	lw	r12, r8, r4		/* v = *(as + n) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r4		/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop		/* while (n) loop */
	bsrli	r11, r12, 8		/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24		/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4, -4		/* n = n - 4 */
	lw	r12, r8, r4		/* v = *(as + n) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r4		/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop		/* while (n) loop */
	bsrli	r11, r12, 24		/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16		/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4, -4		/* n = n - 4 */
	lw	r12, r8, r4		/* v = *(as + n) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r4		/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop		/* while (n) loop */
	bsrli	r11, r12, 16		/* h = v >> 16 (IN DELAY SLOT) */
	/* falls through to d_word_done */

d_word_done:

d_xfer_end:
d_xfer_end_loop:
	/* exit through memmove's own epilogue, not memcpy's a_done */
	beqi	r7, d_done		/* while (c) */
	addi	r6, r6, -1		/* s-- */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r5, r5, -1		/* d-- */
	sbi	r9, r5, 0		/* *d = t1 */
	brid	d_xfer_end_loop		/* loop */
	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

.size  memmove, . - memmove
.end memmove