nh-neon-core.S (2174B)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	// Function arguments, in the order of the nh_neon() prototype below
	KEY		.req	x0
	MESSAGE		.req	x1
	MESSAGE_LEN	.req	x2
	HASH		.req	x3

	// One 2 x 64-bit accumulator per pass
	PASS0_SUMS	.req	v0
	PASS1_SUMS	.req	v1
	PASS2_SUMS	.req	v2
	PASS3_SUMS	.req	v3

	// Sliding window of four 16-byte key strides
	K0		.req	v4
	K1		.req	v5
	K2		.req	v6
	K3		.req	v7

	/*
	 * Scratch registers.  These are deliberately placed in v16-v23:
	 * AAPCS64 makes the low 64 bits of v8-v15 callee-saved, and this
	 * file never saves or restores any vector register, so only
	 * caller-saved registers may be overwritten here.
	 */
	T0		.req	v16
	T1		.req	v17
	T2		.req	v18
	T3		.req	v19
	T4		.req	v20
	T5		.req	v21
	T6		.req	v22
	T7		.req	v23

/*
 * _nh_stride - fold one 16-byte message stride into all four accumulators
 *
 * \k0, \k1, \k2: key strides that are already loaded (passes 0, 1 and 2)
 * \k3:           register that receives the next 16 bytes from KEY (pass 3)
 *
 * For each pass n, let s = message + \kn, computed as four 32-bit lanes
 * with wrap-around.  The macro then performs:
 *
 *	PASSn_SUMS[0] += (u64)s[0] * s[2]
 *	PASSn_SUMS[1] += (u64)s[1] * s[3]
 *
 * MESSAGE and KEY are each post-incremented by 16.
 * Clobbers T0-T7 and \k3.
 */
.macro _nh_stride	k0, k1, k2, k3

	// Load next message stride
	ld1		{T3.16b}, [MESSAGE], #16

	// Load next key stride
	ld1		{\k3\().4s}, [KEY], #16

	// Add message words to key words.  T3 is written last because it
	// still holds the message while T0-T2 are being computed.
	add		T0.4s, T3.4s, \k0\().4s
	add		T1.4s, T3.4s, \k1\().4s
	add		T2.4s, T3.4s, \k2\().4s
	add		T3.4s, T3.4s, \k3\().4s

	// Multiply 32x32 => 64 and accumulate.  The upper two lanes of each
	// sum are first copied down so that umlal pairs lane 0 with lane 2
	// and lane 1 with lane 3.
	mov		T4.d[0], T0.d[1]
	mov		T5.d[0], T1.d[1]
	mov		T6.d[0], T2.d[1]
	mov		T7.d[0], T3.d[1]
	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
.endm

/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * Reads 48 + message_len bytes starting at 'key' (three strides up front,
 * then one more per message stride) and writes 32 bytes to 'hash'.
 *
 * Clobbers x0-x2, v0-v7, v16-v23 and the condition flags.
 */
SYM_FUNC_START(nh_neon)

	// Preload the first three key strides and zero the accumulators
	ld1		{K0.4s,K1.4s}, [KEY], #32
	  movi		PASS0_SUMS.2d, #0
	  movi		PASS1_SUMS.2d, #0
	ld1		{K2.4s}, [KEY], #16
	  movi		PASS2_SUMS.2d, #0
	  movi		PASS3_SUMS.2d, #0

	// MESSAGE_LEN is kept biased by -64 so that the flags set by 'subs'
	// tell whether a full group of four strides is still available.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blt		.Lloop4_done
.Lloop4:
	// Four strides per iteration.  Rotating the register names lets
	// each stride reuse three key strides and load only one new one.
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bge		.Lloop4

.Lloop4_done:
	// Recover the number of leftover bytes: 0, 16, 32 or 48
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Ldone
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K2, K3, K0, K1

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
	st1		{T0.16b,T1.16b}, [HASH]
	ret
SYM_FUNC_END(nh_neon)