cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

ghash-ce-core.S (6891B)


/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	XH2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

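For reference, vmull.p64 (and hence the __pmull_p64 macro above) performs a 64x64 -> 128 bit carry-less (polynomial) multiplication. A minimal C model of that operation, useful for checking the macros on a host machine (the name clmul64 is illustrative, not part of the kernel sources):

#include <stdint.h>

/* 64x64 -> 128 bit carry-less multiply: XOR together b shifted left by
 * every bit position that is set in a.  r[0] holds the low 64 bits. */
static void clmul64(uint64_t a, uint64_t b, uint64_t r[2])
{
	uint64_t lo = 0, hi = 0;

	for (int i = 0; i < 64; i++) {
		if (a & ((uint64_t)1 << i)) {
			lo ^= b << i;
			hi ^= i ? b >> (64 - i) : 0;
		}
	}
	r[0] = lo;
	r[1] = hi;
}
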
	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm
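
The __pmull_p8 macro produces the same 64x64 -> 128 bit product on cores that lack vmull.p64, building it from vmull.p8 (8x8 -> 16 bit) multiplies of byte-rotated operand copies and realigning the partial products with vext and the k16/k32/k48 masks, as described in the paper cited above. A plain schoolbook decomposition into 8x8 pieces, shown only to illustrate the idea (hypothetical helper names, GCC-style unsigned __int128):

#include <stdint.h>

/* 8x8 -> 16 bit carry-less multiply, i.e. what a single vmull.p8 lane does */
static uint16_t clmul8(uint8_t a, uint8_t b)
{
	uint16_t r = 0;

	for (int i = 0; i < 8; i++)
		if (a & (1 << i))
			r ^= (uint16_t)b << i;
	return r;
}

/* 64x64 -> 128 bit product assembled from byte-wise partial products; the
 * NEON macro reaches the same result with eight vmull.p8 instructions by
 * multiplying rotated operand copies lane by lane (see the paper above). */
static void clmul64_by_bytes(uint64_t a, uint64_t b, uint64_t r[2])
{
	unsigned __int128 acc = 0;

	for (int i = 0; i < 8; i++)
		for (int j = 0; j < 8; j++)
			acc ^= (unsigned __int128)clmul8(a >> (8 * i), b >> (8 * j))
			       << (8 * (i + j));
	r[0] = (uint64_t)acc;
	r[1] = (uint64_t)(acc >> 64);
}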

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm
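
Both reduction macros perform the same step: the multiplies above leave a roughly 256-bit carry-less product spread across XL/XM/XH, which must be reduced modulo the GHASH polynomial g(x) = x^128 + x^7 + x^2 + x + 1 to return to 128 bits. The fold uses the identity x^128 ≡ x^7 + x^2 + x + 1 (mod g(x)): __pmull_reduce_p64 applies it with two vmull.p64 instructions against the MASK constant, while __pmull_reduce_p8 spells out the same folding as shifts and XORs (the left shifts by 57, 62 and 63 appear to correspond to the x^7, x^2 and x terms in GCM's bit-reflected representation).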

	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f

0:	.ifc		\pn, p64
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!
	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T3, T3
	vrev64.8	T1, T2

	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif

2:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm
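
Semantically, every iteration of ghash_update folds one 16-byte block of input into the running digest and multiplies the result by the hash key in GF(2^128); the unrolled p64 path does the same for four blocks at a time using what appear to be precomputed powers of the key (HH, HH3, HH4). A minimal, byte-order-naive C model of the per-block step, with gf128_mul following the shift-and-xor description of GHASH multiplication in NIST SP 800-38D (ghash_update_ref is an illustrative name, not the kernel interface):

#include <stdint.h>
#include <stddef.h>

/* GF(2^128) multiply in GCM's bit order, per NIST SP 800-38D.  Blocks are
 * two big-endian 64-bit halves: index 0 holds the first eight bytes. */
static void gf128_mul(uint64_t x[2], const uint64_t y[2])
{
	uint64_t z[2] = { 0, 0 };
	uint64_t v[2] = { x[0], x[1] };

	for (int i = 0; i < 128; i++) {
		/* bit i of y, counting from the MSB of the first byte */
		if ((y[i / 64] >> (63 - (i % 64))) & 1) {
			z[0] ^= v[0];
			z[1] ^= v[1];
		}
		/* v = v * x, reduced by x^128 + x^7 + x^2 + x + 1 */
		int lsb = v[1] & 1;
		v[1] = (v[1] >> 1) | (v[0] << 63);
		v[0] >>= 1;
		if (lsb)
			v[0] ^= 0xe100000000000000ULL;
	}
	x[0] = z[0];
	x[1] = z[1];
}

/* dg <- (dg ^ block) * H for each input block */
static void ghash_update_ref(uint64_t dg[2], const uint64_t h[2],
			     const uint64_t (*blocks)[2], size_t nblocks)
{
	for (size_t i = 0; i < nblocks; i++) {
		dg[0] ^= blocks[i][0];
		dg[1] ^= blocks[i][1];
		gf128_mul(dg, h);
	}
}

The vrev64.8 and vext.8 operations in the assembly deal with the byte order of the NEON lanes, which the model above glosses over.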

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

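	// 0xe1 (1110 0001) encodes the low-order terms 1 + x + x^2 + x^7 of
	// the GHASH polynomial in GCM's bit-reflected order; shifted up by
	// 57 it becomes the 64-bit folding constant that __pmull_reduce_p64
	// feeds to vmull.p64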
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

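	// precompute byte-rotated copies of both key halves; these become the
	// b1..b4 operands of __pmull_p8, which would otherwise have to derive
	// them with vext on every block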
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)