lusercopy.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * User Space Access Routines
 *
 * Copyright (C) 2000-2002 Hewlett-Packard (John Marvin)
 * Copyright (C) 2000 Richard Hirst <rhirst with parisc-linux.org>
 * Copyright (C) 2001 Matthieu Delahaye <delahaym at esiee.fr>
 * Copyright (C) 2003 Randolph Chung <tausq with parisc-linux.org>
 * Copyright (C) 2017 Helge Deller <deller@gmx.de>
 * Copyright (C) 2017 John David Anglin <dave.anglin@bell.net>
 */

/*
 * These routines still have plenty of room for optimization
 * (word & doubleword load/store, dual issue, store hints, etc.).
 */

/*
 * The following routines assume that space register 3 (sr3) contains
 * the space id associated with the current user's address space.
 */


	.text

#include <asm/assembly.h>
#include <asm/errno.h>
#include <linux/linkage.h>

	/*
	 * unsigned long lclear_user(void *to, unsigned long n)
	 *
	 * Returns 0 for success;
	 * otherwise, returns the number of bytes not transferred.
	 */

ENTRY_CFI(lclear_user)
	comib,=,n	0,%r25,$lclu_done
$lclu_loop:
	addib,<>	-1,%r25,$lclu_loop
1:	stbs,ma		%r0,1(%sr3,%r26)

$lclu_done:
	bv		%r0(%r2)
	copy		%r25,%r28

2:	b		$lclu_done
	ldo		1(%r25),%r25

	ASM_EXCEPTIONTABLE_ENTRY(1b,2b)
ENDPROC_CFI(lclear_user)


/*
 * unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
 *
 * Inputs:
 * - sr1 already contains the space of the source region
 * - sr2 already contains the space of the destination region
 *
 * Returns:
 * - number of bytes that could not be copied.
 *   On success, this will be zero.
 *
 * This code is based on a C implementation of a copy routine written by
 * Randolph Chung, which in turn was derived from glibc.
 *
 * Several strategies are tried to get the best performance under various
 * conditions. In the optimal case, we copy in loops that move 32 or 16 bytes
 * at a time using general registers. Unaligned copies are handled either by
 * aligning the destination and then using a shift-and-write method, or in a
 * few cases by falling back to a byte-at-a-time copy.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of memcpy
 * (written in C) is actually quite fast already. This routine is able to beat
 * it by 30-40% for aligned copies because of the loop unrolling, but in some
 * cases the glibc version is still slightly faster. This lends credibility to
 * the idea that gcc can generate very good code as long as we are careful.
 *
 * Possible optimizations:
 * - add cache prefetching
 * - try not to use the post-increment address modifiers; they may create
 *   additional interlocks. The assumption is that those were only efficient
 *   on old machines (pre-PA8000 processors).
 */
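/*
 * Rough C sketch of the strategy described above (illustrative only, not
 * part of the build; the identifiers are arbitrary, the fault handling
 * provided by the exception tables is omitted, and the CONFIG_64BIT build
 * additionally uses 8-byte loads/stores in 32-byte iterations when the
 * operands can be doubleword aligned):
 *
 *	unsigned long pa_memcpy_sketch(unsigned char *d,
 *				       const unsigned char *s, unsigned long n)
 *	{
 *		if (n >= 16 && (((unsigned long)d ^ (unsigned long)s) & 3) == 0) {
 *			while ((unsigned long)d & 3) {	// align dst byte-wise
 *				*d++ = *s++; n--;
 *			}
 *			while (n >= 16) {		// unrolled word copy
 *				unsigned int w0 = ((const unsigned int *)s)[0];
 *				unsigned int w1 = ((const unsigned int *)s)[1];
 *				unsigned int w2 = ((const unsigned int *)s)[2];
 *				unsigned int w3 = ((const unsigned int *)s)[3];
 *				((unsigned int *)d)[0] = w0;
 *				((unsigned int *)d)[1] = w1;
 *				((unsigned int *)d)[2] = w2;
 *				((unsigned int *)d)[3] = w3;
 *				s += 16; d += 16; n -= 16;
 *			}
 *		}
 *		while (n--)		// short copies and tails
 *			*d++ = *s++;
 *		return 0;		// bytes not copied; 0 when no fault occurs
 *	}
 *
 * For operands with differing alignment, the real code replaces the plain
 * byte fallback above with the shift-and-merge loop further below.
 */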
	dst = arg0
	src = arg1
	len = arg2
	end = arg3
	t1 = r19
	t2 = r20
	t3 = r21
	t4 = r22
	srcspc = sr1
	dstspc = sr2

	t0 = r1
	a1 = t1
	a2 = t2
	a3 = t3
	a0 = t4

	save_src = ret0
	save_dst = ret1
	save_len = r31

ENTRY_CFI(pa_memcpy)
	/* Last destination address */
	add	dst,len,end

	/* short copy with less than 16 bytes? */
	cmpib,COND(>>=),n 15,len,.Lbyte_loop

	/* same alignment? */
	xor	src,dst,t0
	extru	t0,31,2,t1
	cmpib,<>,n	0,t1,.Lunaligned_copy

#ifdef CONFIG_64BIT
	/* only do 64-bit copies if we can get aligned. */
	extru	t0,31,3,t1
	cmpib,<>,n	0,t1,.Lalign_loop32

	/* loop until we are 64-bit aligned */
.Lalign_loop64:
	extru	dst,31,3,t1
	cmpib,=,n	0,t1,.Lcopy_loop_16_start
20:	ldb,ma	1(srcspc,src),t1
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lalign_loop64
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_loop_16_start:
	ldi	31,t0
.Lcopy_loop_16:
	cmpb,COND(>>=),n t0,len,.Lword_loop

10:	ldd	0(srcspc,src),t1
11:	ldd	8(srcspc,src),t2
	ldo	16(src),src
12:	std,ma	t1,8(dstspc,dst)
13:	std,ma	t2,8(dstspc,dst)
14:	ldd	0(srcspc,src),t1
15:	ldd	8(srcspc,src),t2
	ldo	16(src),src
16:	std,ma	t1,8(dstspc,dst)
17:	std,ma	t2,8(dstspc,dst)

	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy16_fault)
	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy16_fault)
	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)

	b	.Lcopy_loop_16
	ldo	-32(len),len

.Lword_loop:
	cmpib,COND(>>=),n 3,len,.Lbyte_loop
20:	ldw,ma	4(srcspc,src),t1
21:	stw,ma	t1,4(dstspc,dst)
	b	.Lword_loop
	ldo	-4(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

#endif /* CONFIG_64BIT */

	/* loop until we are 32-bit aligned */
.Lalign_loop32:
	extru	dst,31,2,t1
	cmpib,=,n	0,t1,.Lcopy_loop_8
20:	ldb,ma	1(srcspc,src),t1
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lalign_loop32
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)


.Lcopy_loop_8:
	cmpib,COND(>>=),n 15,len,.Lbyte_loop

10:	ldw	0(srcspc,src),t1
11:	ldw	4(srcspc,src),t2
12:	stw,ma	t1,4(dstspc,dst)
13:	stw,ma	t2,4(dstspc,dst)
14:	ldw	8(srcspc,src),t1
15:	ldw	12(srcspc,src),t2
	ldo	16(src),src
16:	stw,ma	t1,4(dstspc,dst)
17:	stw,ma	t2,4(dstspc,dst)

	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy8_fault)
	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy8_fault)
	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)

	b	.Lcopy_loop_8
	ldo	-16(len),len

.Lbyte_loop:
	cmpclr,COND(<>) len,%r0,%r0
	b,n	.Lcopy_done
20:	ldb	0(srcspc,src),t1
	ldo	1(src),src
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lbyte_loop
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_done:
	bv	%r0(%r2)
	sub	end,dst,ret0


	/* src and dst are not aligned the same way. */
	/* need to go the hard way */
.Lunaligned_copy:
	/* align until dst is 32-bit word aligned */
	extru	dst,31,2,t1
	cmpib,=,n	0,t1,.Lcopy_dstaligned
20:	ldb	0(srcspc,src),t1
	ldo	1(src),src
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lunaligned_copy
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_dstaligned:

	/* store src, dst and len in a safe place */
	copy	src,save_src
	copy	dst,save_dst
	copy	len,save_len

	/* len now needs to give the number of words to copy */
	SHRREG	len,2,len

	/*
	 * Copy from an unaligned src to an aligned dst using shifts.
	 * Handles 4 words per loop.
	 */
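	/*
	 * In C terms, the merge done by shrpw in the loop below is roughly
	 * equivalent to the following (big-endian, illustrative only; the
	 * identifiers are arbitrary, and the fault handling, the 4-word
	 * unrolling and the remainder cases are omitted):
	 *
	 *	unsigned int off  = (unsigned long)src & 3;	// 1, 2 or 3 here
	 *	unsigned int sh   = 8 * off;			// %sar gets 32 - sh
	 *	const unsigned int *s = (const unsigned int *)((unsigned long)src & ~3UL);
	 *	unsigned int *d   = (unsigned int *)dst;	// dst is word aligned
	 *	unsigned int prev = *s++;
	 *	while (words--) {
	 *		unsigned int next = *s++;
	 *		// same result as "shrpw prev,next,%sar,t0"
	 *		*d++ = (prev << sh) | (next >> (32 - sh));
	 *		prev = next;
	 *	}
	 *
	 * off cannot be 0 on this path (src and dst alignments differ and dst
	 * has just been word aligned), so the shifts stay in range.  Each
	 * destination word costs one aligned load and one aligned store
	 * instead of four byte-sized loads and stores, which is where most of
	 * the speedup over the byte loop comes from.
	 */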
	depw,z	src,28,2,t0
	subi	32,t0,t0
	mtsar	t0
	extru	len,31,2,t0
	cmpib,=	2,t0,.Lcase2
	/* Make src aligned by rounding it down.  */
	depi	0,31,2,src

	cmpiclr,<> 3,t0,%r0
	b,n	.Lcase3
	cmpiclr,<> 1,t0,%r0
	b,n	.Lcase1
.Lcase0:
	cmpb,COND(=) %r0,len,.Lcda_finish
	nop

1:	ldw,ma	4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma	4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b,n	.Ldo3
.Lcase1:
1:	ldw,ma	4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma	4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	ldo	-1(len),len
	cmpb,COND(=),n %r0,len,.Ldo0
.Ldo4:
1:	ldw,ma	4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw	a2, a3, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo3:
1:	ldw,ma	4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw	a3, a0, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo2:
1:	ldw,ma	4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw	a0, a1, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo1:
1:	ldw,ma	4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw	a1, a2, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
	ldo	-4(len),len
	cmpb,COND(<>) %r0,len,.Ldo4
	nop
.Ldo0:
	shrpw	a2, a3, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)

.Lcda_rdfault:
.Lcda_finish:
	/* calculate new src, dst and len and jump to byte-copy loop */
	sub	dst,save_dst,t0
	add	save_src,t0,src
	b	.Lbyte_loop
	sub	save_len,t0,len

.Lcase3:
1:	ldw,ma	4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma	4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b	.Ldo2
	ldo	1(len),len
.Lcase2:
1:	ldw,ma	4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma	4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b	.Ldo1
	ldo	2(len),len


	/* fault exception fixup handlers: */
#ifdef CONFIG_64BIT
.Lcopy16_fault:
	b	.Lcopy_done
10:	std,ma	t1,8(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
#endif

.Lcopy8_fault:
	b	.Lcopy_done
10:	stw,ma	t1,4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
ENDPROC_CFI(pa_memcpy)

	.end