xor-neon.c (9011B)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4,
		      const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

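/*
 * Descriptive note (not in the original source): this is the default
 * template, built from the plain NEON routines above; xor_neon_init()
 * below swaps in the EOR3-based versions of do_3/do_4/do_5 when the CPU
 * supports them. The do_* routines use NEON registers, so callers must
 * run them inside a kernel_neon_begin()/kernel_neon_end() section (the
 * arm64 <asm/xor.h> wrapper is expected to take care of this).
 */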
struct xor_block_template xor_block_inner_neon __ro_after_init = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

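/*
 * Descriptive note (not in the original source): EOR3 is part of the
 * SHA3 extension and XORs three vectors in a single instruction, so the
 * eor3()-based routines below need fewer XOR instructions per 64-byte
 * line than the veorq_u64() versions above. The instruction is emitted
 * as inline asm behind an ".arch_extension sha3" directive, so only
 * assembler support is needed to build it; the CONFIG_AS_HAS_SHA3 and
 * runtime CPU feature checks are done in xor_neon_init() below.
 */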
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
	return res;
}

static void xor_arm64_eor3_3(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_4(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_5(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4,
			     const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 ^ p5 */
		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

static int __init xor_neon_init(void)
{
	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
	}
	return 0;
}
module_init(xor_neon_init);

static void __exit xor_neon_exit(void)
{
}
module_exit(xor_neon_exit);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");