nh-neon-core.S (2338B)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, NEON accelerated version
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// Function arguments, in the order given by the nh_neon() prototype
	KEY		.req	r0
	MESSAGE		.req	r1
	MESSAGE_LEN	.req	r2
	HASH		.req	r3

	// One 2 x 64-bit accumulator per pass; _A / _B are its two 64-bit lanes
	PASS0_SUMS	.req	q0
	PASS0_SUM_A	.req	d0
	PASS0_SUM_B	.req	d1
	PASS1_SUMS	.req	q1
	PASS1_SUM_A	.req	d2
	PASS1_SUM_B	.req	d3
	PASS2_SUMS	.req	q2
	PASS2_SUM_A	.req	d4
	PASS2_SUM_B	.req	d5
	PASS3_SUMS	.req	q3
	PASS3_SUM_A	.req	d6
	PASS3_SUM_B	.req	d7

	// Sliding window of four consecutive 16-byte key strides
	K0		.req	q4
	K1		.req	q5
	K2		.req	q6
	K3		.req	q7

	// Temporaries; _L / _H are the low and high 64-bit halves
	T0		.req	q8
	T0_L		.req	d16
	T0_H		.req	d17
	T1		.req	q9
	T1_L		.req	d18
	T1_H		.req	d19
	T2		.req	q10
	T2_L		.req	d20
	T2_H		.req	d21
	T3		.req	q11
	T3_L		.req	d22
	T3_H		.req	d23

/*
 * _nh_stride k0, k1, k2, k3
 *
 * Process one 16-byte message stride for all four passes.
 *
 * Loads 16 message bytes into T3 and the next 16 key bytes into \k3,
 * post-incrementing MESSAGE and KEY.  For pass i, the four 32-bit message
 * words are added to the key words in \k<i>; the low two sums are then
 * multiplied lane-wise with the high two sums (32x32 => 64) and the two
 * products are accumulated into PASS<i>_SUMS.
 *
 * \k0..\k2 must already hold the three preceding key strides; \k3 is
 * overwritten.  Clobbers T0-T3.
 */
.macro _nh_stride	k0, k1, k2, k3

	// Load next message stride
	vld1.8		{T3}, [MESSAGE]!

	// Load next key stride
	vld1.32		{\k3}, [KEY]!

	// Add message words to key words (T3 is overwritten last, since it
	// is the message source for the three other additions)
	vadd.u32	T0, T3, \k0
	vadd.u32	T1, T3, \k1
	vadd.u32	T2, T3, \k2
	vadd.u32	T3, T3, \k3

	// Multiply 32x32 => 64 and accumulate
	vmlal.u32	PASS0_SUMS, T0_L, T0_H
	vmlal.u32	PASS1_SUMS, T1_L, T1_H
	vmlal.u32	PASS2_SUMS, T2_L, T2_H
	vmlal.u32	PASS3_SUMS, T3_L, T3_H
.endm

/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * In:    r0 = key, r1 = message, r2 = message_len, r3 = hash
 * Out:   four 64-bit sums (32 bytes, pass 0 first) stored at 'hash'
 * Clobb: r0-r2, q0-q11, flags
 *
 * Reads 48 bytes of key up front plus 16 more key bytes per 16-byte
 * message stride.
 *
 * NOTE(review): q4-q7 (d8-d15) are overwritten without being saved here;
 * presumably the caller owns the NEON register file around this call -
 * confirm against the call site.
 */
ENTRY(nh_neon)

	// Preload the first three key strides; the accumulator clears are
	// independent of the loads and are simply interleaved with them.
	vld1.32		{K0,K1}, [KEY]!
	vmov.u64	PASS0_SUMS, #0
	vmov.u64	PASS1_SUMS, #0
	vld1.32		{K2}, [KEY]!
	vmov.u64	PASS2_SUMS, #0
	vmov.u64	PASS3_SUMS, #0

	// Main loop: 64 bytes (four strides) per iteration.  MESSAGE_LEN is
	// kept biased by -64.  It is a size_t, so the branches use the
	// unsigned conditions (lo/hs test the borrow out of 'subs'); signed
	// lt/ge would misjudge lengths of 2^31 bytes or more.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blo		.Lloop4_done
.Lloop4:
	// Each stride loads into the oldest key register, so rotating the
	// names by one keeps the window aligned; after four strides the
	// window is back in K0..K2 order.
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bhs		.Lloop4

.Lloop4_done:
	// MESSAGE_LEN has wrapped below zero here, but subtracting multiples
	// of 64 never changes the low six bits, so masking recovers the
	// number of bytes still to process: 0, 16, 32 or 48.
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Ldone
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K2, K3, K0, K1

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	vadd.u64	T0_L, PASS0_SUM_A, PASS0_SUM_B
	vadd.u64	T0_H, PASS1_SUM_A, PASS1_SUM_B
	vadd.u64	T1_L, PASS2_SUM_A, PASS2_SUM_B
	vadd.u64	T1_H, PASS3_SUM_A, PASS3_SUM_B
	vst1.8		{T0-T1}, [HASH]
	bx		lr
ENDPROC(nh_neon)