memset.S (4659B)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2011, The Linux Foundation. All rights reserved.
 */


/* HEXAGON assembly optimized memset */
/* Replaces the standard library function memset */

/*
 * Reading notes for both versions below:
 *
 *  - Instructions grouped in { } form one packet.  Within a packet every
 *    register and predicate read sees the value from BEFORE the packet,
 *    unless the predicate is written as pN.new, in which case the value
 *    produced inside the same packet is used.  Several packets below
 *    deliberately branch on the old predicate while computing the next
 *    one, so the grouping must not be changed casually.
 *  - Register usage on entry, as used by this code:
 *        r0 = destination (left unmodified, it is the return value)
 *        r1 = fill value (only the low byte ends up in memory)
 *        r2 = byte count
 *        r31 = return address (jumpr r31 returns)
 */


/* Emit the standard prologue directives and the entry label for \name. */
	.macro HEXAGON_OPT_FUNC_BEGIN name
	.text
	.p2align 4
	.globl \name
	.type  \name, @function
\name:
	.endm

/* Record the size of \name; must directly follow its last instruction. */
	.macro HEXAGON_OPT_FUNC_FINISH name
	.size  \name, . - \name
	.endm

/* FUNCTION: memset (v2 version) */
/*
 * Working registers:
 *   r8    write pointer (copy of r0)
 *   r4    fill byte replicated into all four bytes; r5:4 = eight fill bytes
 *   r3:2  remaining count as a 64-bit pair, r3 is kept at 0
 *   r7:6  amount to subtract as a 64-bit pair, r7 is kept at 0
 *   r9    8 - (dest & 7): its bits 0..2 select the initial 1/2/4-byte stores
 *   r10   number of 8-byte stores for the doubleword loop
 *
 * Strategy: counts below 8 use a byte loop.  Otherwise store 1, 2 and/or
 * 4 bytes to reach 8-byte alignment, run a doubleword loop, then finish
 * with 4-, 2- and 1-byte stores.
 */
#if __HEXAGON_ARCH__ < 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r6 = #8
		r7 = extractu(r0, #3 , #0)	/* r7 = dest & 7 */
		p0 = cmp.eq(r2, #0)
		p1 = cmp.gtu(r2, #7)
	}
	{
		r4 = vsplatb(r1)
		r8 = r0           /* leave r0 intact for return val  */
		r9 = sub(r6, r7)  /* bytes until double alignment  */
		if p0 jumpr r31   /* count == 0, so return  */
	}
	{
		r3 = #0		/* high word of the r3:2 count pair */
		r7 = #0		/* high word of the r7:6 step pair */
		p0 = tstbit(r9, #0)	/* consumed by the branch at 2: */
		if p1 jump 2f /* skip byte loop */
	}

/* less than 8 bytes to set, so just set a byte at a time and return  */

		loop0(1f, r2) /* byte loop */
	.falign
1: /* byte loop */
	{
		memb(r8++#1) = r4
	}:endloop0
		jumpr r31
	.falign
2: /* skip byte loop */
	/*
	 * Here and in the following stages the branch tests the p0 set by
	 * the PREVIOUS packet, while this packet already computes the p0
	 * for the next stage.
	 */
	{
		r6 = #1
		p0 = tstbit(r9, #1)	/* for the halfword stage at 3: */
		p1 = cmp.eq(r2, #1)
		if !p0 jump 3f /* skip initial byte store */
	}
	{
		memb(r8++#1) = r4
		r3:2 = sub(r3:2, r7:6)	/* r3 = r7 = 0, so this is r2 -= r6 */
		if p1 jumpr r31
	}
	.falign
3: /* skip initial byte store */
	{
		r6 = #2
		p0 = tstbit(r9, #2)	/* for the word stage at 4: */
		p1 = cmp.eq(r2, #2)
		if !p0 jump 4f /* skip initial half store */
	}
	{
		memh(r8++#2) = r4
		r3:2 = sub(r3:2, r7:6)
		if p1 jumpr r31
	}
	.falign
4: /* skip initial half store */
	{
		r6 = #4
		p0 = cmp.gtu(r2, #7)	/* used at 5: if the word store is skipped */
		p1 = cmp.eq(r2, #4)
		if !p0 jump 5f /* skip initial word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)
		/* r2 is read before the subtraction: same as (r2 - 4) > 7 */
		p0 = cmp.gtu(r2, #11)
		if p1 jumpr r31
	}
	.falign
5: /* skip initial word store */
	{
		r10 = lsr(r2, #3)	/* number of whole doublewords left */
		/*
		 * r3 was set to 0 above and the count never borrows on this
		 * path, so this clears p1; the "if p1 jumpr" at 7: then does
		 * not fire when the double loop is skipped.
		 */
		p1 = cmp.eq(r3, #1)
		if !p0 jump 7f /* skip double loop */
	}
	{
		r5 = r4		/* r5:4 = eight copies of the fill byte */
		r6 = #8
		loop0(6f, r10) /* double loop */
	}

/* set bytes a double word at a time  */

	.falign
6: /* double loop */
	{
		memd(r8++#8) = r5:4
		r3:2 = sub(r3:2, r7:6)
		/* pre-subtraction r2 == 8: this store consumes the last bytes */
		p1 = cmp.eq(r2, #8)
	}:endloop0
	.falign
7: /* skip double loop */
	{
		p0 = tstbit(r2, #2)	/* is a final word store needed? */
		if p1 jumpr r31		/* nothing left after the double loop */
	}
	{
		r6 = #4
		p0 = tstbit(r2, #1)	/* for the final halfword stage at 8: */
		p1 = cmp.eq(r2, #4)
		if !p0 jump 8f /* skip final word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)
		if p1 jumpr r31
	}
	.falign
8: /* skip final word store */
	{
		p1 = cmp.eq(r2, #2)
		if !p0 jump 9f /* skip final half store */
	}
	{
		memh(r8++#2) = r4
		if p1 jumpr r31
	}
	.falign
9: /* skip final half store */
	{
		memb(r8++#1) = r4
		jumpr r31
	}
HEXAGON_OPT_FUNC_FINISH memset
#endif


/*  FUNCTION: memset (v3 and higher version)  */
/*
 * Working registers:
 *   r6    write pointer (copy of r0, except in the short byte loop)
 *   r7    fill byte replicated into all four bytes; r5:4 = eight fill bytes
 *   r2    remaining byte count
 *   r3    scratch / number of 32-byte chunks
 *   r8    number of 8-byte stores for the doubleword loop
 *
 * Strategy: counts of 8 or less use a byte loop.  Otherwise align the
 * pointer to 8 bytes; if more than 127 bytes remain, align further to 32
 * bytes and handle 32 bytes per iteration using dczeroa; then run a
 * doubleword loop and finish with 4-, 2- and 1-byte stores.
 */
#if __HEXAGON_ARCH__ >= 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r7=vsplatb(r1)
		r6 = r0
		if (r2==#0) jump:nt .L1		/* count == 0: just return */
	}
	{
		r5:4=combine(r7,r7)
		p0 = cmp.gtu(r2,#8)
		if (p0.new) jump:nt .L3		/* more than 8 bytes */
	}
	/* 1..8 bytes: store one byte at a time and return */
	{
		r3 = r0
		loop0(.L47,r2)
	}
	.falign
.L47:
	{
		memb(r3++#1) = r1
	}:endloop0 /* start=.L47 */
		jumpr r31
.L3:
	/* odd destination address: store one byte to reach 2-byte alignment */
	{
		p0 = tstbit(r0,#0)
		if (!p0.new) jump:nt .L8
		p1 = cmp.eq(r2, #1)
	}
	{
		r6 = add(r0, #1)
		r2 = add(r2,#-1)
		memb(r0) = r1
		if (p1) jump .L1
	}
.L8:
	/* bit 1 of the pointer set: store a halfword to reach 4-byte alignment */
	{
		p0 = tstbit(r6,#1)
		if (!p0.new) jump:nt .L10
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
		p0 = cmp.eq(r2, #2)	/* compares r2 before the subtraction */
		if (p0.new) jump:nt .L1
	}
.L10:
	/* bit 2 of the pointer set: store a word to reach 8-byte alignment */
	{
		p0 = tstbit(r6,#2)
		if (!p0.new) jump:nt .L12
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
		p0 = cmp.eq(r2, #4)	/* compares r2 before the subtraction */
		if (p0.new) jump:nt .L1
	}
.L12:
	/* 127 bytes or fewer left: go straight to the doubleword loop */
	{
		p0 = cmp.gtu(r2,#127)
		if (!p0.new) jump:nt .L14
	}
	/*
	 * r6 is 8-byte aligned here, so at most three doubleword stores
	 * bring it to a 32-byte boundary; after the third store no further
	 * check is needed.
	 */
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
.L17:
	/*
	 * r3 = number of whole 32-byte chunks left.  The test is on the
	 * whole r1 register, not only its low byte; a value whose low byte
	 * is zero but whose upper bits are set takes the .L18 path, which
	 * still stores the replicated low byte.
	 */
	{
		r3 = lsr(r2,#5)
		if (r1!=#0) jump:nt .L18
	}
	/*
	 * Zero fill.  loop0 reads r3 (the chunk count) before "r3 = r6" in
	 * this same packet takes effect.
	 * NOTE(review): the values written to r8 and r3 here do not appear
	 * to be read before r8 is overwritten at .L14 - confirm before
	 * removing.
	 */
	{
		r8 = r3
		r3 = r6
		loop0(.L46,r3)
	}
	.falign
.L46:
	/* no store follows: dczeroa alone clears the 32 bytes at r6 */
	{
		dczeroa(r6)
		r6 = add(r6,#32)
		r2 = add(r2,#-32)
	}:endloop0 /* start=.L46 */
.L14:
	/* doubleword loop over the remaining whole 8-byte units, if any */
	{
		p0 = cmp.gtu(r2,#7)
		if (!p0.new) jump:nt .L28
		r8 = lsr(r2,#3)
	}
		loop0(.L44,r8)
	.falign
.L44:
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}:endloop0 /* start=.L44 */
.L28:
	/* tail: bits 2, 1 and 0 of r2 select a word, halfword and byte store */
	{
		p0 = tstbit(r2,#2)
		if (!p0.new) jump:nt .L33
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
	}
.L33:
	{
		p0 = tstbit(r2,#1)
		if (!p0.new) jump:nt .L35
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
	}
.L35:
		p0 = cmp.eq(r2,#1)
		if (p0) memb(r6) = r1
.L1:
		jumpr r31
.L18:
	/*
	 * Non-zero fill, r3 chunks of 32 bytes: dczeroa on the chunk, then
	 * four doubleword stores overwrite the same 32 bytes with the fill
	 * pattern.  Presumably the dczeroa is there so the cache line is
	 * allocated without first being read from memory - confirm against
	 * the Hexagon reference manual.
	 */
		loop0(.L45,r3)
	.falign
.L45:
		dczeroa(r6)
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-32)
	}
		memd(r6++#8) = r5:4
		memd(r6++#8) = r5:4
	{
		memd(r6++#8) = r5:4
	}:endloop0  /* start=.L45  */
		jump .L14
HEXAGON_OPT_FUNC_FINISH memset
#endif