cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

nh-neon-core.S (2338B)


      1/* SPDX-License-Identifier: GPL-2.0 */
      2/*
      3 * NH - ε-almost-universal hash function, NEON accelerated version
      4 *
      5 * Copyright 2018 Google LLC
      6 *
      7 * Author: Eric Biggers <ebiggers@google.com>
      8 */
      9
     10#include <linux/linkage.h>
     11
     12	.text
     13	.fpu		neon
     14
     15	KEY		.req	r0	// argument 0: key pointer; advanced by every key load
     16	MESSAGE		.req	r1	// argument 1: message pointer; advanced by every message load
     17	MESSAGE_LEN	.req	r2	// argument 2: message length in bytes; counted down by nh_neon
     18	HASH		.req	r3	// argument 3: where the four 64-bit sums are stored
     19	// One q-register accumulator per pass; the _A/_B names are its low/high 64-bit halves
     20	PASS0_SUMS	.req	q0
     21	PASS0_SUM_A	.req	d0
     22	PASS0_SUM_B	.req	d1
     23	PASS1_SUMS	.req	q1
     24	PASS1_SUM_A	.req	d2
     25	PASS1_SUM_B	.req	d3
     26	PASS2_SUMS	.req	q2
     27	PASS2_SUM_A	.req	d4
     28	PASS2_SUM_B	.req	d5
     29	PASS3_SUMS	.req	q3
     30	PASS3_SUM_A	.req	d6
     31	PASS3_SUM_B	.req	d7
     32	K0		.req	q4	// K0-K3: the four most recently loaded 16-byte key strides
     33	K1		.req	q5
     34	K2		.req	q6
     35	K3		.req	q7	// NOTE(review): q4-q7 alias d8-d15 (callee-saved under AAPCS) and are not saved in this file - confirm callers preserve NEON state
     36	T0		.req	q8	// T0-T3: scratch for (message + key); the _L/_H names are the low/high 64-bit halves
     37	T0_L		.req	d16
     38	T0_H		.req	d17
     39	T1		.req	q9
     40	T1_L		.req	d18
     41	T1_H		.req	d19
     42	T2		.req	q10
     43	T2_L		.req	d20
     44	T2_H		.req	d21
     45	T3		.req	q11	// also receives the raw message stride before the adds
     46	T3_L		.req	d22
     47	T3_H		.req	d23
     48
     49.macro _nh_stride	k0, k1, k2, k3
     50	// Process one 16-byte message stride for all four passes: k0-k2 must already hold key strides, k3 is loaded here
     51	// Load the next 16 message bytes into T3 and advance MESSAGE past them
     52	vld1.8		{T3}, [MESSAGE]!
     53
     54	// Load the next 16 key bytes into k3 and advance KEY past them
     55	vld1.32		{\k3}, [KEY]!
     56
     57	// For each pass, add the message words to that pass's key words (32-bit lanes)
     58	vadd.u32	T0, T3, \k0
     59	vadd.u32	T1, T3, \k1
     60	vadd.u32	T2, T3, \k2
     61	vadd.u32	T3, T3, \k3	// must come last: it overwrites the message words held in T3
     62
     63	// For each pass, multiply low-half lanes by high-half lanes (32x32 => 64) and add the products into the 64-bit sums
     64	vmlal.u32	PASS0_SUMS, T0_L, T0_H
     65	vmlal.u32	PASS1_SUMS, T1_L, T1_H
     66	vmlal.u32	PASS2_SUMS, T2_L, T2_H
     67	vmlal.u32	PASS3_SUMS, T3_L, T3_H
     68.endm
     69
     70/*
     71 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
     72 *		u8 hash[NH_HASH_BYTES])
     73 * Feeds each 16-byte message stride to _nh_stride, then stores the four 64-bit pass sums (32 bytes) to 'hash'.
     74 * It's guaranteed that message_len % 16 == 0. Modifies r0-r2, q0-q11 and the condition flags.
     75 */
     76ENTRY(nh_neon)
     77	// Preload three key strides into K0-K2 and zero the sums; the indented vmovs do not depend on the loads around them
     78	vld1.32		{K0,K1}, [KEY]!
     79	  vmov.u64	PASS0_SUMS, #0
     80	  vmov.u64	PASS1_SUMS, #0
     81	vld1.32		{K2}, [KEY]!
     82	  vmov.u64	PASS2_SUMS, #0
     83	  vmov.u64	PASS3_SUMS, #0
     84	// Main loop: 64 bytes per pass through the loop, skipped if fewer remain. Each stride reloads the oldest key register, so after four strides K0-K2 are current again.
     85	subs		MESSAGE_LEN, MESSAGE_LEN, #64
     86	blt		.Lloop4_done
     87.Lloop4:
     88	_nh_stride	K0, K1, K2, K3
     89	_nh_stride	K1, K2, K3, K0
     90	_nh_stride	K2, K3, K0, K1
     91	_nh_stride	K3, K0, K1, K2
     92	subs		MESSAGE_LEN, MESSAGE_LEN, #64
     93	bge		.Lloop4
     94	// MESSAGE_LEN is negative here, but only multiples of 64 were subtracted, so its low six bits are the bytes left over (0, 16, 32 or 48)
     95.Lloop4_done:
     96	ands		MESSAGE_LEN, MESSAGE_LEN, #63
     97	beq		.Ldone
     98	_nh_stride	K0, K1, K2, K3
     99	// Second leftover stride, if any
    100	subs		MESSAGE_LEN, MESSAGE_LEN, #16
    101	beq		.Ldone
    102	_nh_stride	K1, K2, K3, K0
    103	// Third and last leftover stride, if any
    104	subs		MESSAGE_LEN, MESSAGE_LEN, #16
    105	beq		.Ldone
    106	_nh_stride	K2, K3, K0, K1
    107
    108.Ldone:
    109	// Add the two 64-bit lanes of each pass accumulator, then store the four sums to 'hash' in pass order (0, 1, 2, 3)
    110	vadd.u64	T0_L, PASS0_SUM_A, PASS0_SUM_B
    111	vadd.u64	T0_H, PASS1_SUM_A, PASS1_SUM_B
    112	vadd.u64	T1_L, PASS2_SUM_A, PASS2_SUM_B
    113	vadd.u64	T1_H, PASS3_SUM_A, PASS3_SUM_B
    114	vst1.8		{T0-T1}, [HASH]
    115	bx		lr
    116ENDPROC(nh_neon)