cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sm4-neon-core.S (15473B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11

#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15

/* Helper macros. */

#define PREPARE                                                 \
	adr_l		x5, crypto_sm4_sbox;                    \
	ld1		{v16.16b-v19.16b}, [x5], #64;           \
	ld1		{v20.16b-v23.16b}, [x5], #64;           \
	ld1		{v24.16b-v27.16b}, [x5], #64;           \
	ld1		{v28.16b-v31.16b}, [x5];

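/*
 * crypto_sm4_sbox is the generic 256-byte SM4 S-box table; PREPARE keeps
 * it resident in v16-v31 so each round can substitute 16 bytes with
 * tbl/tbx. Since tbl/tbx index at most four registers (64 bytes) per
 * instruction, the lookup below walks the table in four chunks, rebasing
 * the index by 64 between chunks. Illustrative C sketch of that chunked
 * lookup (not part of the build; plain array access stands in for
 * tbl/tbx):
 *
 *	u8 sbox_lookup(const u8 sbox[256], u8 idx)
 *	{
 *		u8 out = 0;
 *
 *		for (int chunk = 0; chunk < 4; chunk++) {
 *			if (idx < 64)		// this chunk covers idx
 *				out = sbox[chunk * 64 + idx];
 *			idx -= 64;		// rebase for next chunk
 *		}
 *		return out;
 *	}
 */
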
#define transpose_4x4(s0, s1, s2, s3)                           \
	zip1		RTMP0.4s, s0.4s, s1.4s;                 \
	zip1		RTMP1.4s, s2.4s, s3.4s;                 \
	zip2		RTMP2.4s, s0.4s, s1.4s;                 \
	zip2		RTMP3.4s, s2.4s, s3.4s;                 \
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;              \
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;              \
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;              \
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;

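/*
 * transpose_4x4 views its four 128-bit inputs as a 4x4 matrix of 32-bit
 * words and transposes it, so that afterwards register n holds word n of
 * all four blocks and one round computation updates the same word of
 * four blocks in parallel. Reference C sketch (not built):
 *
 *	void transpose_4x4(u32 m[4][4])
 *	{
 *		u32 t[4][4];
 *
 *		for (int i = 0; i < 4; i++)
 *			for (int j = 0; j < 4; j++)
 *				t[i][j] = m[j][i];
 *		memcpy(m, t, sizeof(t));
 *	}
 */
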
#define rotate_clockwise_90(s0, s1, s2, s3)                     \
	zip1		RTMP0.4s, s1.4s, s0.4s;                 \
	zip2		RTMP1.4s, s1.4s, s0.4s;                 \
	zip1		RTMP2.4s, s3.4s, s2.4s;                 \
	zip2		RTMP3.4s, s3.4s, s2.4s;                 \
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;              \
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;              \
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;              \
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;

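/*
 * rotate_clockwise_90 undoes the word slicing at the end of
 * SM4_CRYPT_BLK4/8: it transposes the 4x4 word matrix back to per-block
 * layout and at the same time reverses the word order, which (together
 * with the way the ROUND macros rotate the word roles) yields the
 * reversed output words that SM4's final transform requires. Reference
 * C sketch (not built):
 *
 *	void rotate_clockwise_90(u32 m[4][4])
 *	{
 *		u32 t[4][4];
 *
 *		for (int i = 0; i < 4; i++)
 *			for (int j = 0; j < 4; j++)
 *				t[i][j] = m[3 - j][i];
 *		memcpy(m, t, sizeof(t));
 *	}
 */
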
#define ROUND4(round, s0, s1, s2, s3)                           \
	dup		RX0.4s, RKEY.s[round];                  \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	eor		RTMP1.16b, s2.16b, s3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP1.16b;            \
                                                                \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
                                                                \
	/* linear part */                                       \
	shl		RTMP1.4s, RTMP0.4s, #8;                 \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP0.4s, #24;                \
	sri		RTMP1.4s, RTMP0.4s, #(32-8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32-16);           \
	sri		RTMP3.4s, RTMP0.4s, #(32-24);           \
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */            \
	eor		RTMP1.16b, RTMP1.16b, RTMP0.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP2.16b;        \
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */        \
	eor		RTMP3.16b, RTMP3.16b, RTMP0.16b;        \
	shl		RTMP2.4s, RTMP1.4s, #2;                 \
	sri		RTMP2.4s, RTMP1.4s, #(32-2);            \
	eor		RTMP3.16b, RTMP3.16b, RTMP2.16b;        \
	/* s0 ^= RTMP3 */                                       \
	eor		s0.16b, s0.16b, RTMP3.16b;

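/*
 * For reference, one SM4 round in C (a schematic sketch, not the
 * kernel's generic implementation; rol32() and sbox32() are the obvious
 * per-word helpers). ROUND4 computes the same thing for one word of four
 * blocks at once, with the linear transform rewritten as
 * x ^ rol32(x, 24) ^ rol32(x ^ rol32(x, 8) ^ rol32(x, 16), 2), which
 * expands to the specified x ^ rol2 ^ rol10 ^ rol18 ^ rol24:
 *
 *	u32 sm4_round(u32 s0, u32 s1, u32 s2, u32 s3, u32 rk)
 *	{
 *		u32 x = s1 ^ s2 ^ s3 ^ rk;
 *
 *		x = sbox32(x);			// S-box on each byte
 *		x ^= rol32(x, 2) ^ rol32(x, 10) ^
 *		     rol32(x, 18) ^ rol32(x, 24);	// linear transform L
 *		return s0 ^ x;
 *	}
 */
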
#define SM4_CRYPT_BLK4(b0, b1, b2, b3)                          \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
                                                                \
	transpose_4x4(b0, b1, b2, b3);                          \
                                                                \
	mov		x6, 8;                                  \
4:                                                              \
	ld1		{RKEY.4s}, [x0], #16;                   \
	subs		x6, x6, #1;                             \
                                                                \
	ROUND4(0, b0, b1, b2, b3);                              \
	ROUND4(1, b1, b2, b3, b0);                              \
	ROUND4(2, b2, b3, b0, b1);                              \
	ROUND4(3, b3, b0, b1, b2);                              \
                                                                \
	bne		4b;                                     \
                                                                \
	rotate_clockwise_90(b0, b1, b2, b3);                    \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
                                                                \
	/* repoint to rkey */                                   \
	sub		x0, x0, #128;

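/*
 * Shape of SM4_CRYPT_BLK4 as a C sketch (rk is the 32-entry round key
 * array passed in x0, sm4_round() the reference round from the comment
 * above). Each loop iteration loads four round keys into RKEY, so 8
 * iterations cover all 32 rounds; the role rotation of b0-b3 matches the
 * index rotation here. rev32 at entry/exit converts the big-endian block
 * bytes to native words and back, and x0 is rewound by 8 * 16 bytes at
 * the end so the caller keeps a stable round key pointer.
 *
 *	for (int i = 0; i < 8; i++) {		// 8 * 4 = 32 rounds
 *		s0 = sm4_round(s0, s1, s2, s3, rk[4 * i + 0]);
 *		s1 = sm4_round(s1, s2, s3, s0, rk[4 * i + 1]);
 *		s2 = sm4_round(s2, s3, s0, s1, rk[4 * i + 2]);
 *		s3 = sm4_round(s3, s0, s1, s2, rk[4 * i + 3]);
 *	}
 */
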
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)           \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	dup		RX0.4s, RKEY.s[round];                  \
	eor		RTMP0.16b, s2.16b, s3.16b;              \
	mov		RX1.16b, RX0.16b;                       \
	eor		RTMP1.16b, t2.16b, t3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX1.16b, RX1.16b, t1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
                                                                \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	tbl		RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;  \
                                                                \
	/* linear part */                                       \
	shl		RX0.4s, RTMP0.4s, #8;                   \
	shl		RX1.4s, RTMP1.4s, #8;                   \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP1.4s, #16;                \
	sri		RX0.4s, RTMP0.4s, #(32 - 8);            \
	sri		RX1.4s, RTMP1.4s, #(32 - 8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 16);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 16);         \
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
	eor		RX0.16b, RX0.16b, RTMP2.16b;            \
	eor		RX1.16b, RX1.16b, RTMP3.16b;            \
	/* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */        \
	shl		RTMP2.4s, RTMP0.4s, #24;                \
	shl		RTMP3.4s, RTMP1.4s, #24;                \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 24);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 24);         \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	shl		RTMP2.4s, RX0.4s, #2;                   \
	shl		RTMP3.4s, RX1.4s, #2;                   \
	sri		RTMP2.4s, RX0.4s, #(32 - 2);            \
	sri		RTMP3.4s, RX1.4s, #(32 - 2);            \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	/* s0/t0 ^= RTMP0/1 */                                  \
	eor		s0.16b, s0.16b, RTMP0.16b;              \
	eor		t0.16b, t0.16b, RTMP1.16b;

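/*
 * ROUND8 is the same round as ROUND4, applied to two independent groups
 * of four word-sliced blocks (s0-s3 and t0-t3) sharing one round key.
 * The two dependency chains are interleaved instruction by instruction
 * so the tbl/tbx and eor latencies of one group overlap with work from
 * the other; RX0/RX1 double as scratch in the linear part.
 * Schematically (C sketch, per round key rk):
 *
 *	s0 = sm4_round(s0, s1, s2, s3, rk);	// blocks 0-3
 *	t0 = sm4_round(t0, t1, t2, t3, rk);	// blocks 4-7
 */
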
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)          \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	rev32		b4.16b, b4.16b;                         \
	rev32		b5.16b, b5.16b;                         \
	rev32		b6.16b, b6.16b;                         \
	rev32		b7.16b, b7.16b;                         \
                                                                \
	transpose_4x4(b0, b1, b2, b3);                          \
	transpose_4x4(b4, b5, b6, b7);                          \
                                                                \
	mov		x6, 8;                                  \
8:                                                              \
	ld1		{RKEY.4s}, [x0], #16;                   \
	subs		x6, x6, #1;                             \
                                                                \
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);              \
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);              \
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);              \
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);              \
                                                                \
	bne		8b;                                     \
                                                                \
	rotate_clockwise_90(b0, b1, b2, b3);                    \
	rotate_clockwise_90(b4, b5, b6, b7);                    \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	rev32		b4.16b, b4.16b;                         \
	rev32		b5.16b, b5.16b;                         \
	rev32		b6.16b, b6.16b;                         \
	rev32		b7.16b, b7.16b;                         \
                                                                \
	/* repoint to rkey */                                   \
	sub		x0, x0, #128;


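/*
 * C prototypes roughly as the NEON glue code declares them for the
 * entry points below (inferred from the register usage documented in
 * each function header; parameter names are illustrative):
 *
 *	asmlinkage void sm4_neon_crypt_blk1_8(const u32 *rkey, u8 *dst,
 *					      const u8 *src, unsigned int nblks);
 *	asmlinkage void sm4_neon_crypt_blk8(const u32 *rkey, u8 *dst,
 *					    const u8 *src, unsigned int nblks);
 *	asmlinkage void sm4_neon_cbc_dec_blk8(const u32 *rkey, u8 *dst,
 *					      const u8 *src, u8 *iv,
 *					      unsigned int nblks);
 *	asmlinkage void sm4_neon_cfb_dec_blk8(const u32 *rkey, u8 *dst,
 *					      const u8 *src, u8 *iv,
 *					      unsigned int nblks);
 *	asmlinkage void sm4_neon_ctr_enc_blk8(const u32 *rkey, u8 *dst,
 *					      const u8 *src, u8 *ctr,
 *					      unsigned int nblks);
 */
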
.align 3
SYM_FUNC_START_LOCAL(__sm4_neon_crypt_blk1_4)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: num blocks (1..4)
	 */
	PREPARE;

	ld1		{v0.16b}, [x2], #16;
	mov		v1.16b, v0.16b;
	mov		v2.16b, v0.16b;
	mov		v3.16b, v0.16b;
	cmp		w3, #2;
	blt		.Lblk4_load_input_done;
	ld1		{v1.16b}, [x2], #16;
	beq		.Lblk4_load_input_done;
	ld1		{v2.16b}, [x2], #16;
	cmp		w3, #3;
	beq		.Lblk4_load_input_done;
	ld1		{v3.16b}, [x2];

.Lblk4_load_input_done:
	SM4_CRYPT_BLK4(v0, v1, v2, v3);

	st1		{v0.16b}, [x1], #16;
	cmp		w3, #2;
	blt		.Lblk4_store_output_done;
	st1		{v1.16b}, [x1], #16;
	beq		.Lblk4_store_output_done;
	st1		{v2.16b}, [x1], #16;
	cmp		w3, #3;
	beq		.Lblk4_store_output_done;
	st1		{v3.16b}, [x1];

.Lblk4_store_output_done:
	ret;
SYM_FUNC_END(__sm4_neon_crypt_blk1_4)

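/*
 * Partial-count handling, here and in sm4_neon_crypt_blk1_8 below:
 * registers for blocks that were not requested are filled with a copy of
 * an already loaded block, so the fixed 4-way/8-way datapath always
 * operates on valid data, and the cmp/blt/beq chains (NEON ld1/st1 and
 * mov do not touch the flags) ensure only the requested number of blocks
 * is read from src and written back to dst.
 */
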
.align 3
SYM_FUNC_START(sm4_neon_crypt_blk1_8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: num blocks (1..8)
	 */
	cmp		w3, #5;
	blt		__sm4_neon_crypt_blk1_4;

	PREPARE;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b}, [x2], #16;
	mov		v5.16b, v4.16b;
	mov		v6.16b, v4.16b;
	mov		v7.16b, v4.16b;
	beq		.Lblk8_load_input_done;
	ld1		{v5.16b}, [x2], #16;
	cmp		w3, #7;
	blt		.Lblk8_load_input_done;
	ld1		{v6.16b}, [x2], #16;
	beq		.Lblk8_load_input_done;
	ld1		{v7.16b}, [x2];

.Lblk8_load_input_done:
	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	cmp		w3, #6;
	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b}, [x1], #16;
	blt		.Lblk8_store_output_done;
	st1		{v5.16b}, [x1], #16;
	beq		.Lblk8_store_output_done;
	st1		{v6.16b}, [x1], #16;
	cmp		w3, #7;
	beq		.Lblk8_store_output_done;
	st1		{v7.16b}, [x1];

.Lblk8_store_output_done:
	ret;
SYM_FUNC_END(sm4_neon_crypt_blk1_8)

.align 3
SYM_FUNC_START(sm4_neon_crypt_blk8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks (multiples of 8)
	 */
	PREPARE;

.Lcrypt_loop_blk:
	subs		w3, w3, #8;
	bmi		.Lcrypt_end;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1], #64;

	b		.Lcrypt_loop_blk;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_neon_crypt_blk8)

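/*
 * CBC decryption below computes, per block (C sketch; sm4_crypt() stands
 * for one SM4 block operation with the decryption key schedule that the
 * caller passes in x0):
 *
 *	for (i = 0; i < nblks; i++)
 *		dst[i] = sm4_crypt(src[i]) ^ (i ? src[i - 1] : iv);
 *	iv = src[nblks - 1];	// written back for the next call
 *
 * The eight block decryptions per iteration are independent of each
 * other, which is what makes the 8-way NEON path worthwhile; only the
 * final XOR needs the previous ciphertext block, which is re-read from
 * src after SM4_CRYPT_BLK8, and the last ciphertext block is kept in RIV
 * as the chaining value for the next iteration/call.
 */
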
.align 3
SYM_FUNC_START(sm4_neon_cbc_dec_blk8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks (multiples of 8)
	 */
	PREPARE;

	ld1		{RIV.16b}, [x3];

.Lcbc_loop_blk:
	subs		w4, w4, #8;
	bmi		.Lcbc_end;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2];

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	sub		x2, x2, #64;
	eor		v0.16b, v0.16b, RIV.16b;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v1.16b, v1.16b, RTMP0.16b;
	eor		v2.16b, v2.16b, RTMP1.16b;
	eor		v3.16b, v3.16b, RTMP2.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	eor		v4.16b, v4.16b, RTMP3.16b;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v5.16b, v5.16b, RTMP0.16b;
	eor		v6.16b, v6.16b, RTMP1.16b;
	eor		v7.16b, v7.16b, RTMP2.16b;

	mov		RIV.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;

	b		.Lcbc_loop_blk;

.Lcbc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_neon_cbc_dec_blk8)

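/*
 * CFB decryption below: the block cipher is only ever applied to the IV
 * and to ciphertext blocks, so all eight encryptions per iteration are
 * again independent. C sketch (sm4_crypt() as above, but with the
 * encryption key schedule):
 *
 *	u128 x = iv;
 *	for (i = 0; i < nblks; i++) {
 *		dst[i] = sm4_crypt(x) ^ src[i];
 *		x = src[i];
 *	}
 *	iv = x;		// last ciphertext block, written back
 *
 * In the code, v0 holds the IV and v1-v7 hold src[0..6]; after
 * SM4_CRYPT_BLK8 the results are XORed with src[0..7] re-read from x2,
 * and src[7] becomes the next IV.
 */
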
.align 3
SYM_FUNC_START(sm4_neon_cfb_dec_blk8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks (multiples of 8)
	 */
	PREPARE;

	ld1		{v0.16b}, [x3];

.Lcfb_loop_blk:
	subs		w4, w4, #8;
	bmi		.Lcfb_end;

	ld1		{v1.16b, v2.16b, v3.16b}, [x2], #48;
	ld1		{v4.16b-v7.16b}, [x2];

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	sub		x2, x2, #48;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v4.16b, v4.16b, RTMP0.16b;
	eor		v5.16b, v5.16b, RTMP1.16b;
	eor		v6.16b, v6.16b, RTMP2.16b;
	eor		v7.16b, v7.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;

	mov		v0.16b, RTMP3.16b;

	b		.Lcfb_loop_blk;

.Lcfb_end:
	/* store new IV */
	st1		{v0.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_neon_cfb_dec_blk8)

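/*
 * CTR encryption below keeps the 128-bit big-endian counter in x7:x8 in
 * native byte order so it can be incremented with a plain adds/adc pair;
 * inc_le128 materialises one counter value per block and rev64 converts
 * it back to the big-endian byte layout the cipher consumes. C sketch of
 * the per-call behaviour (helpers are the usual kernel unaligned
 * accessors; sm4_crypt() as above):
 *
 *	u64 hi = get_unaligned_be64(ctr);
 *	u64 lo = get_unaligned_be64(ctr + 8);
 *
 *	for (i = 0; i < nblks; i++) {
 *		u8 block[16];
 *
 *		put_unaligned_be64(hi, block);
 *		put_unaligned_be64(lo, block + 8);
 *		dst[i] = src[i] ^ sm4_crypt(block);
 *		if (++lo == 0)			// 128-bit increment
 *			hi++;
 *	}
 *	put_unaligned_be64(hi, ctr);
 *	put_unaligned_be64(lo, ctr + 8);
 */
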
.align 3
SYM_FUNC_START(sm4_neon_ctr_enc_blk8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks (multiples of 8)
	 */
	PREPARE;

	ldp		x7, x8, [x3];
	rev		x7, x7;
	rev		x8, x8;

.Lctr_loop_blk:
	subs		w4, w4, #8;
	bmi		.Lctr_end;

#define inc_le128(vctr)                     \
	mov		vctr.d[1], x8;      \
	mov		vctr.d[0], x7;      \
	adds		x8, x8, #1;         \
	adc		x7, x7, xzr;        \
	rev64		vctr.16b, vctr.16b;

	/* construct CTRs */
	inc_le128(v0);			/* +0 */
	inc_le128(v1);			/* +1 */
	inc_le128(v2);			/* +2 */
	inc_le128(v3);			/* +3 */
	inc_le128(v4);			/* +4 */
	inc_le128(v5);			/* +5 */
	inc_le128(v6);			/* +6 */
	inc_le128(v7);			/* +7 */

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v4.16b, v4.16b, RTMP0.16b;
	eor		v5.16b, v5.16b, RTMP1.16b;
	eor		v6.16b, v6.16b, RTMP2.16b;
	eor		v7.16b, v7.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;

	b		.Lctr_loop_blk;

.Lctr_end:
	/* store new CTR */
	rev		x7, x7;
	rev		x8, x8;
	stp		x7, x8, [x3];

	ret;
SYM_FUNC_END(sm4_neon_ctr_enc_blk8)