strncmp.S
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2022, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#define L(label) .L ## label

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif

SYM_FUNC_START(__pi_strncmp)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs.  */
L(syndrome_check):
	/* Limit was reached.  Check if the NUL byte or the difference
	   is before the limit.  */
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	tmp1, L(not_limit)

	lsl	limit, tmp1, #3	/* Bytes -> bits.  */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
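	/* Illustration only (a C sketch, not part of the build): the
	   cmp/cset/cneg sequence just below computes, in C terms,

		result = (data1 == data2) ? 0 : (data1 < data2 ? -1 : 1);

	   using an unsigned 64-bit compare; on big-endian the dword
	   comparison matches the byte-wise string order directly.  */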
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align).  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret

	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order.  For little-endian, it is
	   reversed in the registers.  The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1		| a a a a a a a a | b b b c c c c c | . . .
	   src2		| x x x x x a a a a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
			STEP_A		STEP_B		STEP_C
	   data1	a a a a a a a a	b b b c c c c c	b b b c c c c c
	   data2	a a a a a a a a	b b b 0 0 0 0 0	0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes.  This way we can read 16 bytes at a
	   time from SRC2.  The comparison happens in 3 steps.  After each step
	   the loop can exit, or read from SRC1 or SRC2.  */
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits.  No
	   need to mask offset since shifts are ignoring upper bits.  */
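	/* Illustration only (a C sketch of the shift-count behaviour, not
	   part of the build): AArch64 variable shifts use just the low six
	   bits of the shift count, so as a shift amount the value computed
	   below behaves like

		offset = (src2 & 7) * 8;	// bit offset within a dword

	   even though the upper bits of src2 remain set in the register.  */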
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later.  */
	/* Skip the first compare if data in tmp1 is irrelevant.  */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.  */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.  */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1.  */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2.  */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid carry-propagation
	   problem described above.  This way we can reuse the has_nul in the
	   next step and also use syndrome value trick at the end.  */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1.  */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret
SYM_FUNC_END(__pi_strncmp)
SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
EXPORT_SYMBOL_NOKASAN(strncmp)
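/* Illustration only: a minimal C sketch of the NUL-detection principle
   used throughout this file.  has_zero_byte() is a hypothetical helper
   named here for exposition; it is not part of the kernel build.

	#include <stdint.h>

	#define REP8_01 0x0101010101010101ULL
	#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

	static inline uint64_t has_zero_byte(uint64_t x)
	{
		// For each byte b, bit 7 of (b - 1) survives the AND with
		// ~b only when b == 0, so the result is non-zero iff some
		// byte of x is zero (borrows only matter past a zero byte).
		return (x - REP8_01) & ~(x | REP8_7f);
	}

   The aligned loop fuses this with the difference check, in effect
   syndrome = (data1 ^ data2) | has_zero_byte(data1); on little-endian a
   rev + clz of the syndrome then locates the first mismatch or NUL, as
   done at L(syndrome_check).  */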