nh-sse2-x86_64.S (2742B)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

/* One 2 x 64-bit accumulator vector per NH pass. */
#define		PASS0_SUMS	%xmm0
#define		PASS1_SUMS	%xmm1
#define		PASS2_SUMS	%xmm2
#define		PASS3_SUMS	%xmm3

/* Sliding window of four consecutive 16-byte key vectors. */
#define		K0		%xmm4
#define		K1		%xmm5
#define		K2		%xmm6
#define		K3		%xmm7

/* Scratch vectors. */
#define		T0		%xmm8
#define		T1		%xmm9
#define		T2		%xmm10
#define		T3		%xmm11
#define		T4		%xmm12
#define		T5		%xmm13
#define		T6		%xmm14
#define		T7		%xmm15

/* Function arguments, in the order of the C prototype below. */
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx

/*
 * _nh_stride - process one 16-byte message stride for all four passes.
 *
 * \key0..\key2 hold the three key vectors carried over from earlier strides;
 * \key3 is overwritten with the 16 key bytes at \off(KEY).  The same message
 * vector is added to each of the four key vectors, so pass N of a stride
 * uses the key vector N positions further along than pass 0 does.
 *
 * On exit \key0 no longer holds key material (it is used as scratch for
 * pass 0), which is why callers rotate the K registers from one invocation
 * to the next.  Clobbers T1-T7.
 */
.macro _nh_stride	key0, key1, key2, key3, off

	// Fetch 16 message bytes
	movdqu		\off(MESSAGE), T1

	// Fetch the key vector that newly enters the window
	movdqu		\off(KEY), \key3

	// message + key, as four 32-bit lanes, once per pass
	movdqa		T1, T2
	movdqa		T1, T3
	paddd		T1, \key0	// pass 0 result lands in \key0, saving a copy
	paddd		\key1, T1
	paddd		\key2, T2
	paddd		\key3, T3

	// Shuffle $0x10 places words 0 and 1 in the even dword lanes and
	// shuffle $0x32 places words 2 and 3 there.  pmuludq multiplies the
	// even lanes only, so each pass yields the two 64-bit products
	// word0 * word2 and word1 * word3, which are then accumulated.
	pshufd		$0x10, \key0, T4
	pshufd		$0x32, \key0, \key0
	pshufd		$0x10, T1, T5
	pshufd		$0x32, T1, T1
	pshufd		$0x10, T2, T6
	pshufd		$0x32, T2, T2
	pshufd		$0x10, T3, T7
	pshufd		$0x32, T3, T3
	pmuludq		T4, \key0
	pmuludq		T5, T1
	pmuludq		T6, T2
	pmuludq		T7, T3
	paddq		\key0, PASS0_SUMS
	paddq		T1, PASS1_SUMS
	paddq		T2, PASS2_SUMS
	paddq		T3, PASS3_SUMS
.endm

/*
 * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * Writes 32 bytes to 'hash': one 64-bit sum per pass, pass 0 first.
 * Modifies KEY, MESSAGE, MESSAGE_LEN, %xmm0-%xmm15 and the flags; does not
 * touch the stack.
 */
SYM_FUNC_START(nh_sse2)

	// Prime the key window with the first three vectors; the fourth is
	// loaded by the first stride, hence the 0x30 advance of KEY.
	movdqu		0x00(KEY), K0
	movdqu		0x10(KEY), K1
	movdqu		0x20(KEY), K2
	add		$0x30, KEY
	pxor		PASS0_SUMS, PASS0_SUMS
	pxor		PASS1_SUMS, PASS1_SUMS
	pxor		PASS2_SUMS, PASS2_SUMS
	pxor		PASS3_SUMS, PASS3_SUMS

	// Bias the counter by -64 so that its sign says whether another full
	// 64-byte block is available (signed branches are intentional here).
	sub		$0x40, MESSAGE_LEN
	jl		.Ltail
.Lblock64:
	// Four strides per iteration; rotating the K arguments returns the
	// window to its starting register assignment after every block.
	_nh_stride	K0, K1, K2, K3, 0x00
	_nh_stride	K1, K2, K3, K0, 0x10
	_nh_stride	K2, K3, K0, K1, 0x20
	_nh_stride	K3, K0, K1, K2, 0x30
	add		$0x40, KEY
	add		$0x40, MESSAGE
	sub		$0x40, MESSAGE_LEN
	jge		.Lblock64

.Ltail:
	// The low six bits of the (now negative) counter are the leftover
	// byte count: 0, 16, 32 or 48 given the multiple-of-16 guarantee.
	and		$0x3f, MESSAGE_LEN
	jz		.Lreduce
	_nh_stride	K0, K1, K2, K3, 0x00

	sub		$0x10, MESSAGE_LEN
	jz		.Lreduce
	_nh_stride	K1, K2, K3, K0, 0x10

	sub		$0x10, MESSAGE_LEN
	jz		.Lreduce
	_nh_stride	K2, K3, K0, K1, 0x20

.Lreduce:
	// Each accumulator holds two partial sums (A = low qword, B = high
	// qword).  Pair up the A halves and the B halves of adjacent passes,
	// add them, and write the four totals out to 'hash'.
	movdqa		PASS0_SUMS, T0
	movdqa		PASS2_SUMS, T1
	punpcklqdq	PASS1_SUMS, T0		// T0 = (PASS0_SUM_A PASS1_SUM_A)
	punpcklqdq	PASS3_SUMS, T1		// T1 = (PASS2_SUM_A PASS3_SUM_A)
	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// = (PASS0_SUM_B PASS1_SUM_B)
	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// = (PASS2_SUM_B PASS3_SUM_B)
	paddq		PASS0_SUMS, T0
	paddq		PASS2_SUMS, T1
	movdqu		T0, 0x00(HASH)
	movdqu		T1, 0x10(HASH)
	RET
SYM_FUNC_END(nh_sse2)