nh-neon-core.S - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
nh-neon-core.S (2174B)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

     10#include <linux/linkage.h>
     11
     12	KEY		.req	x0
     13	MESSAGE		.req	x1
     14	MESSAGE_LEN	.req	x2
     15	HASH		.req	x3
     16
     17	PASS0_SUMS	.req	v0
     18	PASS1_SUMS	.req	v1
     19	PASS2_SUMS	.req	v2
     20	PASS3_SUMS	.req	v3
     21	K0		.req	v4
     22	K1		.req	v5
     23	K2		.req	v6
     24	K3		.req	v7
     25	T0		.req	v8
     26	T1		.req	v9
     27	T2		.req	v10
     28	T3		.req	v11
     29	T4		.req	v12
     30	T5		.req	v13
     31	T6		.req	v14
     32	T7		.req	v15
     33
     34.macro _nh_stride	k0, k1, k2, k3
     35
     36	// Load next message stride
     37	ld1		{T3.16b}, [MESSAGE], #16
     38
     39	// Load next key stride
     40	ld1		{\k3\().4s}, [KEY], #16
     41
     42	// Add message words to key words
     43	add		T0.4s, T3.4s, \k0\().4s
     44	add		T1.4s, T3.4s, \k1\().4s
     45	add		T2.4s, T3.4s, \k2\().4s
     46	add		T3.4s, T3.4s, \k3\().4s
     47
     48	// Multiply 32x32 => 64 and accumulate
     49	mov		T4.d[0], T0.d[1]
     50	mov		T5.d[0], T1.d[1]
     51	mov		T6.d[0], T2.d[1]
     52	mov		T7.d[0], T3.d[1]
     53	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
     54	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
     55	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
     56	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
     57.endm
     58
     59/*
     60 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
     61 *		u8 hash[NH_HASH_BYTES])
     62 *
     63 * It's guaranteed that message_len % 16 == 0.
     64 */
     65SYM_FUNC_START(nh_neon)
     66
     67	ld1		{K0.4s,K1.4s}, [KEY], #32
     68	  movi		PASS0_SUMS.2d, #0
     69	  movi		PASS1_SUMS.2d, #0
     70	ld1		{K2.4s}, [KEY], #16
     71	  movi		PASS2_SUMS.2d, #0
     72	  movi		PASS3_SUMS.2d, #0
     73
     74	subs		MESSAGE_LEN, MESSAGE_LEN, #64
     75	blt		.Lloop4_done
     76.Lloop4:
     77	_nh_stride	K0, K1, K2, K3
     78	_nh_stride	K1, K2, K3, K0
     79	_nh_stride	K2, K3, K0, K1
     80	_nh_stride	K3, K0, K1, K2
     81	subs		MESSAGE_LEN, MESSAGE_LEN, #64
     82	bge		.Lloop4
     83
     84.Lloop4_done:
     85	ands		MESSAGE_LEN, MESSAGE_LEN, #63
     86	beq		.Ldone
     87	_nh_stride	K0, K1, K2, K3
     88
     89	subs		MESSAGE_LEN, MESSAGE_LEN, #16
     90	beq		.Ldone
     91	_nh_stride	K1, K2, K3, K0
     92
     93	subs		MESSAGE_LEN, MESSAGE_LEN, #16
     94	beq		.Ldone
     95	_nh_stride	K2, K3, K0, K1
     96
     97.Ldone:
     98	// Sum the accumulators for each pass, then store the sums to 'hash'
     99	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
    100	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
    101	st1		{T0.16b,T1.16b}, [HASH]
    102	ret
    103SYM_FUNC_END(nh_neon)