cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

aes_ctrby8_avx-x86_64.S (10199B)


      1/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
      2/*
      3 * AES CTR mode by8 optimization with AVX instructions. (x86_64)
      4 *
      5 * Copyright(c) 2014 Intel Corporation.
      6 *
      7 * Contact Information:
      8 * James Guilford <james.guilford@intel.com>
      9 * Sean Gulley <sean.m.gulley@intel.com>
     10 * Chandramouli Narayanan <mouli@linux.intel.com>
     11 */
     12/*
     13 * This is AES128/192/256 CTR mode optimization implementation. It requires
     14 * the support of Intel(R) AESNI and AVX instructions.
     15 *
     16 * This work was inspired by the AES CTR mode optimization published
     17 * in Intel Optimized IPSEC Cryptographic library.
     18 * Additional information on it can be found at:
     19 *    https://github.com/intel/intel-ipsec-mb
     20 */
     21
     22#include <linux/linkage.h>
     23
     24#define VMOVDQ		vmovdqu
     25
     26#define xdata0		%xmm0
     27#define xdata1		%xmm1
     28#define xdata2		%xmm2
     29#define xdata3		%xmm3
     30#define xdata4		%xmm4
     31#define xdata5		%xmm5
     32#define xdata6		%xmm6
     33#define xdata7		%xmm7
     34#define xcounter	%xmm8
     35#define xbyteswap	%xmm9
     36#define xkey0		%xmm10
     37#define xkey4		%xmm11
     38#define xkey8		%xmm12
     39#define xkey12		%xmm13
     40#define xkeyA		%xmm14
     41#define xkeyB		%xmm15
     42
     43#define p_in		%rdi
     44#define p_iv		%rsi
     45#define p_keys		%rdx
     46#define p_out		%rcx
     47#define num_bytes	%r8
     48
     49#define tmp		%r10
     50#define	DDQ_DATA	0
     51#define	XDATA		1
     52#define KEY_128		1
     53#define KEY_192		2
     54#define KEY_256		3
     55
     56.section .rodata
     57.align 16
     58
     59byteswap_const:
     60	.octa 0x000102030405060708090A0B0C0D0E0F
     61ddq_low_msk:
     62	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
     63ddq_high_add_1:
     64	.octa 0x00000000000000010000000000000000
     65ddq_add_1:
     66	.octa 0x00000000000000000000000000000001
     67ddq_add_2:
     68	.octa 0x00000000000000000000000000000002
     69ddq_add_3:
     70	.octa 0x00000000000000000000000000000003
     71ddq_add_4:
     72	.octa 0x00000000000000000000000000000004
     73ddq_add_5:
     74	.octa 0x00000000000000000000000000000005
     75ddq_add_6:
     76	.octa 0x00000000000000000000000000000006
     77ddq_add_7:
     78	.octa 0x00000000000000000000000000000007
     79ddq_add_8:
     80	.octa 0x00000000000000000000000000000008
     81
     82.text
     83
     84/* generate a unique variable for ddq_add_x */
     85
     86/* generate a unique variable for xmm register */
     87.macro setxdata n
     88	var_xdata = %xmm\n
     89.endm
     90
     91/* club the numeric 'id' to the symbol 'name' */
     92
     93.macro club name, id
     94.altmacro
     95	.if \name == XDATA
     96		setxdata %\id
     97	.endif
     98.noaltmacro
     99.endm
    100
    101/*
    102 * do_aes num_in_par load_keys key_len
    103 * This increments p_in, but not p_out
    104 */
    105.macro do_aes b, k, key_len
    106	.set by, \b
    107	.set load_keys, \k
    108	.set klen, \key_len
    109
    110	.if (load_keys)
    111		vmovdqa	0*16(p_keys), xkey0
    112	.endif
    113
    114	vpshufb	xbyteswap, xcounter, xdata0
    115
    116	.set i, 1
    117	.rept (by - 1)
    118		club XDATA, i
    119		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
    120		vptest	ddq_low_msk(%rip), var_xdata
    121		jnz 1f
    122		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
    123		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
    124		1:
    125		vpshufb	xbyteswap, var_xdata, var_xdata
    126		.set i, (i +1)
    127	.endr
    128
    129	vmovdqa	1*16(p_keys), xkeyA
    130
    131	vpxor	xkey0, xdata0, xdata0
    132	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
    133	vptest	ddq_low_msk(%rip), xcounter
    134	jnz	1f
    135	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
    136	1:
    137
    138	.set i, 1
    139	.rept (by - 1)
    140		club XDATA, i
    141		vpxor	xkey0, var_xdata, var_xdata
    142		.set i, (i +1)
    143	.endr
    144
    145	vmovdqa	2*16(p_keys), xkeyB
    146
    147	.set i, 0
    148	.rept by
    149		club XDATA, i
    150		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
    151		.set i, (i +1)
    152	.endr
    153
    154	.if (klen == KEY_128)
    155		.if (load_keys)
    156			vmovdqa	3*16(p_keys), xkey4
    157		.endif
    158	.else
    159		vmovdqa	3*16(p_keys), xkeyA
    160	.endif
    161
    162	.set i, 0
    163	.rept by
    164		club XDATA, i
    165		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
    166		.set i, (i +1)
    167	.endr
    168
    169	add	$(16*by), p_in
    170
    171	.if (klen == KEY_128)
    172		vmovdqa	4*16(p_keys), xkeyB
    173	.else
    174		.if (load_keys)
    175			vmovdqa	4*16(p_keys), xkey4
    176		.endif
    177	.endif
    178
    179	.set i, 0
    180	.rept by
    181		club XDATA, i
    182		/* key 3 */
    183		.if (klen == KEY_128)
    184			vaesenc	xkey4, var_xdata, var_xdata
    185		.else
    186			vaesenc	xkeyA, var_xdata, var_xdata
    187		.endif
    188		.set i, (i +1)
    189	.endr
    190
    191	vmovdqa	5*16(p_keys), xkeyA
    192
    193	.set i, 0
    194	.rept by
    195		club XDATA, i
    196		/* key 4 */
    197		.if (klen == KEY_128)
    198			vaesenc	xkeyB, var_xdata, var_xdata
    199		.else
    200			vaesenc	xkey4, var_xdata, var_xdata
    201		.endif
    202		.set i, (i +1)
    203	.endr
    204
    205	.if (klen == KEY_128)
    206		.if (load_keys)
    207			vmovdqa	6*16(p_keys), xkey8
    208		.endif
    209	.else
    210		vmovdqa	6*16(p_keys), xkeyB
    211	.endif
    212
    213	.set i, 0
    214	.rept by
    215		club XDATA, i
    216		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
    217		.set i, (i +1)
    218	.endr
    219
    220	vmovdqa	7*16(p_keys), xkeyA
    221
    222	.set i, 0
    223	.rept by
    224		club XDATA, i
    225		/* key 6 */
    226		.if (klen == KEY_128)
    227			vaesenc	xkey8, var_xdata, var_xdata
    228		.else
    229			vaesenc	xkeyB, var_xdata, var_xdata
    230		.endif
    231		.set i, (i +1)
    232	.endr
    233
    234	.if (klen == KEY_128)
    235		vmovdqa	8*16(p_keys), xkeyB
    236	.else
    237		.if (load_keys)
    238			vmovdqa	8*16(p_keys), xkey8
    239		.endif
    240	.endif
    241
    242	.set i, 0
    243	.rept by
    244		club XDATA, i
    245		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
    246		.set i, (i +1)
    247	.endr
    248
    249	.if (klen == KEY_128)
    250		.if (load_keys)
    251			vmovdqa	9*16(p_keys), xkey12
    252		.endif
    253	.else
    254		vmovdqa	9*16(p_keys), xkeyA
    255	.endif
    256
    257	.set i, 0
    258	.rept by
    259		club XDATA, i
    260		/* key 8 */
    261		.if (klen == KEY_128)
    262			vaesenc	xkeyB, var_xdata, var_xdata
    263		.else
    264			vaesenc	xkey8, var_xdata, var_xdata
    265		.endif
    266		.set i, (i +1)
    267	.endr
    268
    269	vmovdqa	10*16(p_keys), xkeyB
    270
    271	.set i, 0
    272	.rept by
    273		club XDATA, i
    274		/* key 9 */
    275		.if (klen == KEY_128)
    276			vaesenc	xkey12, var_xdata, var_xdata
    277		.else
    278			vaesenc	xkeyA, var_xdata, var_xdata
    279		.endif
    280		.set i, (i +1)
    281	.endr
    282
    283	.if (klen != KEY_128)
    284		vmovdqa	11*16(p_keys), xkeyA
    285	.endif
    286
    287	.set i, 0
    288	.rept by
    289		club XDATA, i
    290		/* key 10 */
    291		.if (klen == KEY_128)
    292			vaesenclast	xkeyB, var_xdata, var_xdata
    293		.else
    294			vaesenc	xkeyB, var_xdata, var_xdata
    295		.endif
    296		.set i, (i +1)
    297	.endr
    298
    299	.if (klen != KEY_128)
    300		.if (load_keys)
    301			vmovdqa	12*16(p_keys), xkey12
    302		.endif
    303
    304		.set i, 0
    305		.rept by
    306			club XDATA, i
    307			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
    308			.set i, (i +1)
    309		.endr
    310
    311		.if (klen == KEY_256)
    312			vmovdqa	13*16(p_keys), xkeyA
    313		.endif
    314
    315		.set i, 0
    316		.rept by
    317			club XDATA, i
    318			.if (klen == KEY_256)
    319				/* key 12 */
    320				vaesenc	xkey12, var_xdata, var_xdata
    321			.else
    322				vaesenclast xkey12, var_xdata, var_xdata
    323			.endif
    324			.set i, (i +1)
    325		.endr
    326
    327		.if (klen == KEY_256)
    328			vmovdqa	14*16(p_keys), xkeyB
    329
    330			.set i, 0
    331			.rept by
    332				club XDATA, i
    333				/* key 13 */
    334				vaesenc	xkeyA, var_xdata, var_xdata
    335				.set i, (i +1)
    336			.endr
    337
    338			.set i, 0
    339			.rept by
    340				club XDATA, i
    341				/* key 14 */
    342				vaesenclast	xkeyB, var_xdata, var_xdata
    343				.set i, (i +1)
    344			.endr
    345		.endif
    346	.endif
    347
    348	.set i, 0
    349	.rept (by / 2)
    350		.set j, (i+1)
    351		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
    352		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
    353		club XDATA, i
    354		vpxor	xkeyA, var_xdata, var_xdata
    355		club XDATA, j
    356		vpxor	xkeyB, var_xdata, var_xdata
    357		.set i, (i+2)
    358	.endr
    359
    360	.if (i < by)
    361		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
    362		club XDATA, i
    363		vpxor	xkeyA, var_xdata, var_xdata
    364	.endif
    365
    366	.set i, 0
    367	.rept by
    368		club XDATA, i
    369		VMOVDQ	var_xdata, i*16(p_out)
    370		.set i, (i+1)
    371	.endr
    372.endm
    373
    374.macro do_aes_load val, key_len
    375	do_aes \val, 1, \key_len
    376.endm
    377
    378.macro do_aes_noload val, key_len
    379	do_aes \val, 0, \key_len
    380.endm
    381
    382/* main body of aes ctr load */
    383
    384.macro do_aes_ctrmain key_len
    385	cmp	$16, num_bytes
    386	jb	.Ldo_return2\key_len
    387
    388	vmovdqa	byteswap_const(%rip), xbyteswap
    389	vmovdqu	(p_iv), xcounter
    390	vpshufb	xbyteswap, xcounter, xcounter
    391
    392	mov	num_bytes, tmp
    393	and	$(7*16), tmp
    394	jz	.Lmult_of_8_blks\key_len
    395
    396	/* 1 <= tmp <= 7 */
    397	cmp	$(4*16), tmp
    398	jg	.Lgt4\key_len
    399	je	.Leq4\key_len
    400
    401.Llt4\key_len:
    402	cmp	$(2*16), tmp
    403	jg	.Leq3\key_len
    404	je	.Leq2\key_len
    405
    406.Leq1\key_len:
    407	do_aes_load	1, \key_len
    408	add	$(1*16), p_out
    409	and	$(~7*16), num_bytes
    410	jz	.Ldo_return2\key_len
    411	jmp	.Lmain_loop2\key_len
    412
    413.Leq2\key_len:
    414	do_aes_load	2, \key_len
    415	add	$(2*16), p_out
    416	and	$(~7*16), num_bytes
    417	jz	.Ldo_return2\key_len
    418	jmp	.Lmain_loop2\key_len
    419
    420
    421.Leq3\key_len:
    422	do_aes_load	3, \key_len
    423	add	$(3*16), p_out
    424	and	$(~7*16), num_bytes
    425	jz	.Ldo_return2\key_len
    426	jmp	.Lmain_loop2\key_len
    427
    428.Leq4\key_len:
    429	do_aes_load	4, \key_len
    430	add	$(4*16), p_out
    431	and	$(~7*16), num_bytes
    432	jz	.Ldo_return2\key_len
    433	jmp	.Lmain_loop2\key_len
    434
    435.Lgt4\key_len:
    436	cmp	$(6*16), tmp
    437	jg	.Leq7\key_len
    438	je	.Leq6\key_len
    439
    440.Leq5\key_len:
    441	do_aes_load	5, \key_len
    442	add	$(5*16), p_out
    443	and	$(~7*16), num_bytes
    444	jz	.Ldo_return2\key_len
    445	jmp	.Lmain_loop2\key_len
    446
    447.Leq6\key_len:
    448	do_aes_load	6, \key_len
    449	add	$(6*16), p_out
    450	and	$(~7*16), num_bytes
    451	jz	.Ldo_return2\key_len
    452	jmp	.Lmain_loop2\key_len
    453
    454.Leq7\key_len:
    455	do_aes_load	7, \key_len
    456	add	$(7*16), p_out
    457	and	$(~7*16), num_bytes
    458	jz	.Ldo_return2\key_len
    459	jmp	.Lmain_loop2\key_len
    460
    461.Lmult_of_8_blks\key_len:
    462	.if (\key_len != KEY_128)
    463		vmovdqa	0*16(p_keys), xkey0
    464		vmovdqa	4*16(p_keys), xkey4
    465		vmovdqa	8*16(p_keys), xkey8
    466		vmovdqa	12*16(p_keys), xkey12
    467	.else
    468		vmovdqa	0*16(p_keys), xkey0
    469		vmovdqa	3*16(p_keys), xkey4
    470		vmovdqa	6*16(p_keys), xkey8
    471		vmovdqa	9*16(p_keys), xkey12
    472	.endif
    473.align 16
    474.Lmain_loop2\key_len:
    475	/* num_bytes is a multiple of 8 and >0 */
    476	do_aes_noload	8, \key_len
    477	add	$(8*16), p_out
    478	sub	$(8*16), num_bytes
    479	jne	.Lmain_loop2\key_len
    480
    481.Ldo_return2\key_len:
    482	/* return updated IV */
    483	vpshufb	xbyteswap, xcounter, xcounter
    484	vmovdqu	xcounter, (p_iv)
    485	RET
    486.endm
    487
    488/*
    489 * routine to do AES128 CTR enc/decrypt "by8"
    490 * XMM registers are clobbered.
    491 * Saving/restoring must be done at a higher level
    492 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
    493 *			unsigned int num_bytes)
    494 */
    495SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
    496	/* call the aes main loop */
    497	do_aes_ctrmain KEY_128
    498
    499SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
    500
    501/*
    502 * routine to do AES192 CTR enc/decrypt "by8"
    503 * XMM registers are clobbered.
    504 * Saving/restoring must be done at a higher level
    505 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
    506 *			unsigned int num_bytes)
    507 */
    508SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
    509	/* call the aes main loop */
    510	do_aes_ctrmain KEY_192
    511
    512SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
    513
    514/*
    515 * routine to do AES256 CTR enc/decrypt "by8"
    516 * XMM registers are clobbered.
    517 * Saving/restoring must be done at a higher level
    518 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
    519 *			unsigned int num_bytes)
    520 */
    521SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
    522	/* call the aes main loop */
    523	do_aes_ctrmain KEY_256
    524
    525SYM_FUNC_END(aes_ctr_enc_256_avx_by8)