memmove.S (8025B)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
SYM_FUNC_START_WEAK(memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */

	/* Return if nothing to do */
	beq  a0, a1, return_from_memmove
	beqz a2, return_from_memmove

	/*
	 * Register Uses
	 *   Forward Copy: a1 - Index counter of src
	 *   Reverse Copy: a4 - Index counter of src
	 *   Forward Copy: t3 - Index counter of dest
	 *   Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse Shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv   t3, a0
	add  t4, a0, a2
	add  a4, a1, a2

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi t0, a2, -(2 * SZREG)
	beqz t0, byte_copy

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest (register t3) rounded down to the nearest naturally
	 * aligned SZREG address does not equal dest, then add SZREG
	 * to find the low-bound of SZREG alignment in the dest memory
	 * region.  Note that this could overshoot the dest memory
	 * region if n is less than SZREG.  This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq  t5, t3, 1f
		addi t5, t5, SZREG
	1:

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor  t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, coaligned_copy
	/* Fall through to misaligned fixup copy */

misaligned_fixup_copy:
	bltu a1, a0, misaligned_fixup_copy_reverse

misaligned_fixup_copy_forward:
	jal  t0, byte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 *   a7 = XLEN - a6 = XLEN + -a6
	 *   2s complement negation to find the negative: -a6 = ~a6 + 1
	 *   Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)
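
	/*
	 * Worked example (illustrative; assumes RV64, so SZREG = 8 and
	 * XLEN = 64): if src is 3 bytes past an SZREG boundary, then
	 * a6 = 3 * 8 = 24 and a7 = ~24 + 65 = -25 + 65 = 40 = 64 - 24.
	 * Each store below then merges the upper 40 bits of one aligned
	 * source word with the lower 24 bits of the next one.
	 */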

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 * 	load_val1 = load_ptr[1];
	 * 	store_ptr += 2;
	 * 	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val0 = load_ptr[2];
	 * 	load_ptr += 2;
	 * 	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t0, (0 * SZREG)(a1)
	1:
		REG_L t1, (1 * SZREG)(a1)
		addi  t3, t3, (2 * SZREG)
		srl   t0, t0, a6
		sll   t2, t1, a7
		or    t2, t0, t2
		REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

		beq   t3, a2, 2f

		REG_L t0, (2 * SZREG)(a1)
		addi  a1, a1, (2 * SZREG)
		srl   t1, t1, a6
		sll   t2, t0, a7
		or    t2, t1, t2
		REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

		bne   t3, t6, 1b
	2:
	mv    t3, t6 /* Fix the dest pointer in case the loop was broken */

	add  a1, t3, a5 /* Restore the src pointer */
	j byte_copy_forward /* Copy any remaining bytes */

misaligned_fixup_copy_reverse:
	jal  t0, byte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 *   a7 = XLEN - a6 = XLEN + -a6
	 *   2s complement negation to find the negative: -a6 = ~a6 + 1
	 *   Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 * 	load_val0 = load_ptr[-1];
	 * 	store_ptr -= 2;
	 * 	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val1 = load_ptr[-2];
	 * 	load_ptr -= 2;
	 * 	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t1, ( 0 * SZREG)(a4)
	1:
		REG_L t0, (-1 * SZREG)(a4)
		addi  t4, t4, (-2 * SZREG)
		sll   t1, t1, a7
		srl   t2, t0, a6
		or    t2, t1, t2
		REG_S t2, ( 1 * SZREG)(t4)

		beq   t4, a2, 2f

		REG_L t1, (-2 * SZREG)(a4)
		addi  a4, a4, (-2 * SZREG)
		sll   t0, t0, a7
		srl   t2, t1, a6
		or    t2, t0, t2
		REG_S t2, ( 0 * SZREG)(t4)

		bne   t4, t5, 1b
	2:
	mv    t4, t5 /* Fix the dest pointer in case the loop was broken */

	add  a4, t4, a5 /* Restore the src pointer */
	j byte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
 */
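/*
 * C-style sketch of the SZREG-at-a-time loops below (forward case
 * shown; the reverse case mirrors it with decrementing pointers):
 * do {
 * 	val = *load_ptr++;
 * 	*store_ptr++ = val;
 * } while (store_ptr != store_ptr_end);
 */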
coaligned_copy:
	bltu a1, a0, coaligned_copy_reverse

coaligned_copy_forward:
	jal  t0, byte_copy_until_aligned_forward

	1:
		REG_L t1, ( 0 * SZREG)(a1)
		addi  a1, a1, SZREG
		addi  t3, t3, SZREG
		REG_S t1, (-1 * SZREG)(t3)
		bne   t3, t6, 1b

	j byte_copy_forward /* Copy any remaining bytes */

coaligned_copy_reverse:
	jal  t0, byte_copy_until_aligned_reverse

	1:
		REG_L t1, (-1 * SZREG)(a4)
		addi  a4, a4, -SZREG
		addi  t4, t4, -SZREG
		REG_S t1, ( 0 * SZREG)(t4)
		bne   t4, t5, 1b

	j byte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function.  They
 * are used to byte copy until the dest pointer is in alignment,
 * at which point a bulk copy method can be used by the calling
 * code.  These work on the same registers as the bulk copy
 * loops.  Therefore, the register values can be picked up from
 * where they were left and we avoid code duplication without
 * any overhead except the call-in and return jumps.
 */
byte_copy_until_aligned_forward:
	beq  t3, t5, 2f
	1:
		lb   t1, 0(a1)
		addi a1, a1, 1
		addi t3, t3, 1
		sb   t1, -1(t3)
		bne  t3, t5, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

byte_copy_until_aligned_reverse:
	beq  t4, t6, 2f
	1:
		lb   t1, -1(a4)
		addi a4, a4, -1
		addi t4, t4, -1
		sb   t1, 0(t4)
		bne  t4, t6, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of data to copy.
 * At that point, they will call to return from memmove.
 */
byte_copy:
	bltu a1, a0, byte_copy_reverse

byte_copy_forward:
	beq  t3, t4, 2f
	1:
		lb   t1, 0(a1)
		addi a1, a1, 1
		addi t3, t3, 1
		sb   t1, -1(t3)
		bne  t3, t4, 1b
	2:
	ret

byte_copy_reverse:
	beq  t4, t3, 2f
	1:
		lb   t1, -1(a4)
		addi a4, a4, -1
		addi t4, t4, -1
		sb   t1, 0(t4)
		bne  t4, t3, 1b
	2:

return_from_memmove:
	ret

SYM_FUNC_END(memmove)
SYM_FUNC_END(__memmove)
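
/*
 * Overall strategy, as a C-style sketch (illustrative only, not part
 * of the build; the helper names refer to the labels above):
 *
 * void *memmove(void *dest, const void *src, size_t n)
 * {
 * 	if (dest == src || n == 0)
 * 		return dest;
 * 	if (n < 2 * SZREG)
 * 		byte_copy();                 // too small for bulk copy
 * 	else if (((uintptr_t)dest ^ (uintptr_t)src) & (SZREG - 1))
 * 		misaligned_fixup_copy();     // shift-and-merge aligned words
 * 	else
 * 		coaligned_copy();            // straight SZREG-sized copy
 * 	return dest;
 * }
 *
 * Each path copies forward when src >= dest and in reverse when
 * src < dest, so overlapping regions are handled correctly.
 */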