cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sm4-ce-core.S (15378B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
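
/*
 * Expected C-side declarations (a sketch only; the authoritative prototypes
 * live in the accompanying glue code, sm4-ce-glue.c, and may differ in type
 * spelling).  Argument order follows the register comments at the top of
 * each function below (x0..x4 / w3..w4):
 *
 *   asmlinkage void sm4_ce_expand_key(const u8 *key, u32 *rkey_enc,
 *                                     u32 *rkey_dec, const u32 *fk,
 *                                     const u32 *ck);
 *   asmlinkage void sm4_ce_crypt_block(const u32 *rkey, u8 *dst,
 *                                      const u8 *src);
 *   asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst, const u8 *src,
 *                                unsigned int nblocks);
 *   asmlinkage void sm4_ce_cbc_enc(const u32 *rkey, u8 *dst, const u8 *src,
 *                                  u8 *iv, unsigned int nblocks);
 *   asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src,
 *                                  u8 *iv, unsigned int nblocks);
 *   asmlinkage void sm4_ce_cfb_enc(const u32 *rkey, u8 *dst, const u8 *src,
 *                                  u8 *iv, unsigned int nblocks);
 *   asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src,
 *                                  u8 *iv, unsigned int nblocks);
 *   asmlinkage void sm4_ce_ctr_enc(const u32 *rkey, u8 *dst, const u8 *src,
 *                                  u8 *iv, unsigned int nblocks);
 */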

.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
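
/*
 * sm4e and sm4ekey are the ARMv8 Crypto Extension SM4 instructions.  They
 * are emitted as raw .inst words so the file still assembles with toolchains
 * that lack SM4 support; the .Lv<n>.4s symbols defined above supply the
 * register numbers that are OR-ed into the Rd/Rn/Rm fields of the opcodes.
 */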

/* Register macros */

#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

#define RIV	v20

/* Helper macros. */

#define PREPARE                                       \
	ld1		{v24.16b-v27.16b}, [x0], #64; \
	ld1		{v28.16b-v31.16b}, [x0];
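
/*
 * PREPARE loads the whole round-key schedule (32 x 32-bit round keys, i.e.
 * eight 128-bit vectors) from the pointer in x0 into v24-v31, where the
 * SM4_CRYPT_BLK* macros below expect to find it.
 */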

#define SM4_CRYPT_BLK(b0)                           \
	rev32		b0.16b, b0.16b;             \
	sm4e		b0.4s, v24.4s;              \
	sm4e		b0.4s, v25.4s;              \
	sm4e		b0.4s, v26.4s;              \
	sm4e		b0.4s, v27.4s;              \
	sm4e		b0.4s, v28.4s;              \
	sm4e		b0.4s, v29.4s;              \
	sm4e		b0.4s, v30.4s;              \
	sm4e		b0.4s, v31.4s;              \
	rev64		b0.4s, b0.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	rev32		b0.16b, b0.16b;
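
/*
 * Each sm4e executes four SM4 rounds against one vector of round keys, so
 * the eight sm4e above perform the full 32 rounds.  The leading rev32
 * byte-swaps each 32-bit word of the input block into the order sm4e
 * expects; the trailing rev64 + ext reverse the order of the four state
 * words (the SM4 output swap) and the final rev32 swaps the bytes back.
 */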

#define SM4_CRYPT_BLK4(b0, b1, b2, b3)              \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;             \
	sm4e		b0.4s, v24.4s;              \
	sm4e		b1.4s, v24.4s;              \
	sm4e		b2.4s, v24.4s;              \
	sm4e		b3.4s, v24.4s;              \
	sm4e		b0.4s, v25.4s;              \
	sm4e		b1.4s, v25.4s;              \
	sm4e		b2.4s, v25.4s;              \
	sm4e		b3.4s, v25.4s;              \
	sm4e		b0.4s, v26.4s;              \
	sm4e		b1.4s, v26.4s;              \
	sm4e		b2.4s, v26.4s;              \
	sm4e		b3.4s, v26.4s;              \
	sm4e		b0.4s, v27.4s;              \
	sm4e		b1.4s, v27.4s;              \
	sm4e		b2.4s, v27.4s;              \
	sm4e		b3.4s, v27.4s;              \
	sm4e		b0.4s, v28.4s;              \
	sm4e		b1.4s, v28.4s;              \
	sm4e		b2.4s, v28.4s;              \
	sm4e		b3.4s, v28.4s;              \
	sm4e		b0.4s, v29.4s;              \
	sm4e		b1.4s, v29.4s;              \
	sm4e		b2.4s, v29.4s;              \
	sm4e		b3.4s, v29.4s;              \
	sm4e		b0.4s, v30.4s;              \
	sm4e		b1.4s, v30.4s;              \
	sm4e		b2.4s, v30.4s;              \
	sm4e		b3.4s, v30.4s;              \
	sm4e		b0.4s, v31.4s;              \
	sm4e		b1.4s, v31.4s;              \
	sm4e		b2.4s, v31.4s;              \
	sm4e		b3.4s, v31.4s;              \
	rev64		b0.4s, b0.4s;               \
	rev64		b1.4s, b1.4s;               \
	rev64		b2.4s, b2.4s;               \
	rev64		b3.4s, b3.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	ext		b1.16b, b1.16b, b1.16b, #8; \
	ext		b2.16b, b2.16b, b2.16b, #8; \
	ext		b3.16b, b3.16b, b3.16b, #8; \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;             \
	rev32		b4.16b, b4.16b;             \
	rev32		b5.16b, b5.16b;             \
	rev32		b6.16b, b6.16b;             \
	rev32		b7.16b, b7.16b;             \
	sm4e		b0.4s, v24.4s;              \
	sm4e		b1.4s, v24.4s;              \
	sm4e		b2.4s, v24.4s;              \
	sm4e		b3.4s, v24.4s;              \
	sm4e		b4.4s, v24.4s;              \
	sm4e		b5.4s, v24.4s;              \
	sm4e		b6.4s, v24.4s;              \
	sm4e		b7.4s, v24.4s;              \
	sm4e		b0.4s, v25.4s;              \
	sm4e		b1.4s, v25.4s;              \
	sm4e		b2.4s, v25.4s;              \
	sm4e		b3.4s, v25.4s;              \
	sm4e		b4.4s, v25.4s;              \
	sm4e		b5.4s, v25.4s;              \
	sm4e		b6.4s, v25.4s;              \
	sm4e		b7.4s, v25.4s;              \
	sm4e		b0.4s, v26.4s;              \
	sm4e		b1.4s, v26.4s;              \
	sm4e		b2.4s, v26.4s;              \
	sm4e		b3.4s, v26.4s;              \
	sm4e		b4.4s, v26.4s;              \
	sm4e		b5.4s, v26.4s;              \
	sm4e		b6.4s, v26.4s;              \
	sm4e		b7.4s, v26.4s;              \
	sm4e		b0.4s, v27.4s;              \
	sm4e		b1.4s, v27.4s;              \
	sm4e		b2.4s, v27.4s;              \
	sm4e		b3.4s, v27.4s;              \
	sm4e		b4.4s, v27.4s;              \
	sm4e		b5.4s, v27.4s;              \
	sm4e		b6.4s, v27.4s;              \
	sm4e		b7.4s, v27.4s;              \
	sm4e		b0.4s, v28.4s;              \
	sm4e		b1.4s, v28.4s;              \
	sm4e		b2.4s, v28.4s;              \
	sm4e		b3.4s, v28.4s;              \
	sm4e		b4.4s, v28.4s;              \
	sm4e		b5.4s, v28.4s;              \
	sm4e		b6.4s, v28.4s;              \
	sm4e		b7.4s, v28.4s;              \
	sm4e		b0.4s, v29.4s;              \
	sm4e		b1.4s, v29.4s;              \
	sm4e		b2.4s, v29.4s;              \
	sm4e		b3.4s, v29.4s;              \
	sm4e		b4.4s, v29.4s;              \
	sm4e		b5.4s, v29.4s;              \
	sm4e		b6.4s, v29.4s;              \
	sm4e		b7.4s, v29.4s;              \
	sm4e		b0.4s, v30.4s;              \
	sm4e		b1.4s, v30.4s;              \
	sm4e		b2.4s, v30.4s;              \
	sm4e		b3.4s, v30.4s;              \
	sm4e		b4.4s, v30.4s;              \
	sm4e		b5.4s, v30.4s;              \
	sm4e		b6.4s, v30.4s;              \
	sm4e		b7.4s, v30.4s;              \
	sm4e		b0.4s, v31.4s;              \
	sm4e		b1.4s, v31.4s;              \
	sm4e		b2.4s, v31.4s;              \
	sm4e		b3.4s, v31.4s;              \
	sm4e		b4.4s, v31.4s;              \
	sm4e		b5.4s, v31.4s;              \
	sm4e		b6.4s, v31.4s;              \
	sm4e		b7.4s, v31.4s;              \
	rev64		b0.4s, b0.4s;               \
	rev64		b1.4s, b1.4s;               \
	rev64		b2.4s, b2.4s;               \
	rev64		b3.4s, b3.4s;               \
	rev64		b4.4s, b4.4s;               \
	rev64		b5.4s, b5.4s;               \
	rev64		b6.4s, b6.4s;               \
	rev64		b7.4s, b7.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	ext		b1.16b, b1.16b, b1.16b, #8; \
	ext		b2.16b, b2.16b, b2.16b, #8; \
	ext		b3.16b, b3.16b, b3.16b, #8; \
	ext		b4.16b, b4.16b, b4.16b, #8; \
	ext		b5.16b, b5.16b, b5.16b, #8; \
	ext		b6.16b, b6.16b, b6.16b, #8; \
	ext		b7.16b, b7.16b, b7.16b, #8; \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;             \
	rev32		b4.16b, b4.16b;             \
	rev32		b5.16b, b5.16b;             \
	rev32		b6.16b, b6.16b;             \
	rev32		b7.16b, b7.16b;
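
/*
 * SM4_CRYPT_BLK4 and SM4_CRYPT_BLK8 are the same 32-round sequence applied
 * to four or eight independent blocks, with the per-round sm4e instructions
 * interleaved across blocks so that consecutive instructions do not depend
 * on each other and can overlap in the pipeline.
 */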


.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/* input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc
	 *   x2: rkey_dec
	 *   x3: fk array
	 *   x4: ck array
	 */
	ld1		{v0.16b}, [x0];
	rev32		v0.16b, v0.16b;
	ld1		{v1.16b}, [x3];
	/* load ck */
	ld1		{v24.16b-v27.16b}, [x4], #64;
	ld1		{v28.16b-v31.16b}, [x4];

	/* input ^ fk */
	eor		v0.16b, v0.16b, v1.16b;

	sm4ekey		v0.4s, v0.4s, v24.4s;
	sm4ekey		v1.4s, v0.4s, v25.4s;
	sm4ekey		v2.4s, v1.4s, v26.4s;
	sm4ekey		v3.4s, v2.4s, v27.4s;
	sm4ekey		v4.4s, v3.4s, v28.4s;
	sm4ekey		v5.4s, v4.4s, v29.4s;
	sm4ekey		v6.4s, v5.4s, v30.4s;
	sm4ekey		v7.4s, v6.4s, v31.4s;

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1];
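
	/*
	 * The decryption schedule is the encryption schedule in reverse order:
	 * rev64 + ext reverse the four round keys within each vector, and the
	 * stores below walk the vectors from v7 down to v0.
	 */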
	rev64		v7.4s, v7.4s;
	rev64		v6.4s, v6.4s;
	rev64		v5.4s, v5.4s;
	rev64		v4.4s, v4.4s;
	rev64		v3.4s, v3.4s;
	rev64		v2.4s, v2.4s;
	rev64		v1.4s, v1.4s;
	rev64		v0.4s, v0.4s;
	ext		v7.16b, v7.16b, v7.16b, #8;
	ext		v6.16b, v6.16b, v6.16b, #8;
	ext		v5.16b, v5.16b, v5.16b, #8;
	ext		v4.16b, v4.16b, v4.16b, #8;
	ext		v3.16b, v3.16b, v3.16b, #8;
	ext		v2.16b, v2.16b, v2.16b, #8;
	ext		v1.16b, v1.16b, v1.16b, #8;
	ext		v0.16b, v0.16b, v0.16b, #8;
	st1		{v7.16b}, [x2], #16;
	st1		{v6.16b}, [x2], #16;
	st1		{v5.16b}, [x2], #16;
	st1		{v4.16b}, [x2], #16;
	st1		{v3.16b}, [x2], #16;
	st1		{v2.16b}, [x2], #16;
	st1		{v1.16b}, [x2], #16;
	st1		{v0.16b}, [x2];

	ret;
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 */
	PREPARE;

	ld1		{v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1];

	ret;
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	PREPARE;

.Lcrypt_loop_blk:
	sub		w3, w3, #8;
	tbnz		w3, #31, .Lcrypt_tail8;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;
	b		.Lcrypt_loop_blk;

.Lcrypt_tail8:
	add		w3, w3, #8;
	cmp		w3, #4;
	blt		.Lcrypt_tail4;

	sub		w3, w3, #4;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	st1		{v0.16b-v3.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;

.Lcrypt_tail4:
	sub		w3, w3, #1;

	ld1		{v0.16b}, [x2], #16;
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1], #16;

	cbnz		w3, .Lcrypt_tail4;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;

	ld1		{RIV.16b}, [x3];

.Lcbc_enc_loop:
	sub		w4, w4, #1;

	ld1		{RTMP0.16b}, [x2], #16;
	eor		RIV.16b, RIV.16b, RTMP0.16b;

	SM4_CRYPT_BLK(RIV);

	st1		{RIV.16b}, [x1], #16;

	cbnz		w4, .Lcbc_enc_loop;

	/* store new IV */
	st1		{RIV.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;

	ld1		{RIV.16b}, [x3];

.Lcbc_loop_blk:
	sub		w4, w4, #8;
	tbnz		w4, #31, .Lcbc_tail8;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2];

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

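	/*
	 * x2 still points at the second half of this batch of ciphertext
	 * (the first ld1 above post-incremented, the second did not).
	 * Rewind it and re-read the ciphertext for the CBC chaining XORs,
	 * keeping the last ciphertext block (RTMP3) as the next IV.
	 */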
	sub		x2, x2, #64;
	eor		v0.16b, v0.16b, RIV.16b;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v1.16b, v1.16b, RTMP0.16b;
	eor		v2.16b, v2.16b, RTMP1.16b;
	eor		v3.16b, v3.16b, RTMP2.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	eor		v4.16b, v4.16b, RTMP3.16b;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v5.16b, v5.16b, RTMP0.16b;
	eor		v6.16b, v6.16b, RTMP1.16b;
	eor		v7.16b, v7.16b, RTMP2.16b;

	mov		RIV.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;

	cbz		w4, .Lcbc_end;
	b		.Lcbc_loop_blk;

.Lcbc_tail8:
	add		w4, w4, #8;
	cmp		w4, #4;
	blt		.Lcbc_tail4;

	sub		w4, w4, #4;

	ld1		{v0.16b-v3.16b}, [x2];

	SM4_CRYPT_BLK4(v0, v1, v2, v3);

	eor		v0.16b, v0.16b, RIV.16b;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v1.16b, v1.16b, RTMP0.16b;
	eor		v2.16b, v2.16b, RTMP1.16b;
	eor		v3.16b, v3.16b, RTMP2.16b;

	mov		RIV.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	cbz		w4, .Lcbc_end;

.Lcbc_tail4:
	sub		w4, w4, #1;

	ld1		{v0.16b}, [x2];

	SM4_CRYPT_BLK(v0);

	eor		v0.16b, v0.16b, RIV.16b;
	ld1		{RIV.16b}, [x2], #16;
	st1		{v0.16b}, [x1], #16;

	cbnz		w4, .Lcbc_tail4;

.Lcbc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cfb_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;

	ld1		{RIV.16b}, [x3];

.Lcfb_enc_loop:
	sub		w4, w4, #1;

	SM4_CRYPT_BLK(RIV);

	ld1		{RTMP0.16b}, [x2], #16;
	eor		RIV.16b, RIV.16b, RTMP0.16b;
	st1		{RIV.16b}, [x1], #16;

	cbnz		w4, .Lcfb_enc_loop;

	/* store new IV */
	st1		{RIV.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_ce_cfb_enc)

.align 3
SYM_FUNC_START(sm4_ce_cfb_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;

	ld1		{v0.16b}, [x3];

.Lcfb_loop_blk:
	sub		w4, w4, #8;
	tbnz		w4, #31, .Lcfb_tail8;

	ld1		{v1.16b, v2.16b, v3.16b}, [x2], #48;
	ld1		{v4.16b-v7.16b}, [x2];

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	sub		x2, x2, #48;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v4.16b, v4.16b, RTMP0.16b;
	eor		v5.16b, v5.16b, RTMP1.16b;
	eor		v6.16b, v6.16b, RTMP2.16b;
	eor		v7.16b, v7.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;

	mov		v0.16b, RTMP3.16b;

	cbz		w4, .Lcfb_end;
	b		.Lcfb_loop_blk;

.Lcfb_tail8:
	add		w4, w4, #8;
	cmp		w4, #4;
	blt		.Lcfb_tail4;

	sub		w4, w4, #4;

	ld1		{v1.16b, v2.16b, v3.16b}, [x2];

	SM4_CRYPT_BLK4(v0, v1, v2, v3);

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	mov		v0.16b, RTMP3.16b;

	cbz		w4, .Lcfb_end;

.Lcfb_tail4:
	sub		w4, w4, #1;

	SM4_CRYPT_BLK(v0);

	ld1		{RTMP0.16b}, [x2], #16;
	eor		v0.16b, v0.16b, RTMP0.16b;
	st1		{v0.16b}, [x1], #16;

	mov		v0.16b, RTMP0.16b;

	cbnz		w4, .Lcfb_tail4;

.Lcfb_end:
	/* store new IV */
	st1		{v0.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_ce_cfb_dec)

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	PREPARE;

	ldp		x7, x8, [x3];
	rev		x7, x7;
	rev		x8, x8;

.Lctr_loop_blk:
	sub		w4, w4, #8;
	tbnz		w4, #31, .Lctr_tail8;

#define inc_le128(vctr)                     \
	mov		vctr.d[1], x8;      \
	mov		vctr.d[0], x7;      \
	adds		x8, x8, #1;         \
	adc		x7, x7, xzr;        \
	rev64		vctr.16b, vctr.16b;
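
/*
 * inc_le128 materializes the current counter value as a big-endian block in
 * vctr (x7 holds the native-endian high 64 bits, x8 the low 64 bits, as set
 * up by the ldp/rev above), then advances the 128-bit counter in x7:x8 with
 * a carry from the low half into the high half.
 */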

	/* construct CTRs */
	inc_le128(v0);			/* +0 */
	inc_le128(v1);			/* +1 */
	inc_le128(v2);			/* +2 */
	inc_le128(v3);			/* +3 */
	inc_le128(v4);			/* +4 */
	inc_le128(v5);			/* +5 */
	inc_le128(v6);			/* +6 */
	inc_le128(v7);			/* +7 */

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v4.16b, v4.16b, RTMP0.16b;
	eor		v5.16b, v5.16b, RTMP1.16b;
	eor		v6.16b, v6.16b, RTMP2.16b;
	eor		v7.16b, v7.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;

	cbz		w4, .Lctr_end;
	b		.Lctr_loop_blk;

.Lctr_tail8:
	add		w4, w4, #8;
	cmp		w4, #4;
	blt		.Lctr_tail4;

	sub		w4, w4, #4;

	/* construct CTRs */
	inc_le128(v0);			/* +0 */
	inc_le128(v1);			/* +1 */
	inc_le128(v2);			/* +2 */
	inc_le128(v3);			/* +3 */

	SM4_CRYPT_BLK4(v0, v1, v2, v3);

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	cbz		w4, .Lctr_end;

.Lctr_tail4:
	sub		w4, w4, #1;

	/* construct CTRs */
	inc_le128(v0);

	SM4_CRYPT_BLK(v0);

	ld1		{RTMP0.16b}, [x2], #16;
	eor		v0.16b, v0.16b, RTMP0.16b;
	st1		{v0.16b}, [x1], #16;

	cbnz		w4, .Lctr_tail4;

.Lctr_end:
	/* store new CTR */
	rev		x7, x7;
	rev		x8, x8;
	stp		x7, x8, [x3];

	ret;
SYM_FUNC_END(sm4_ce_ctr_enc)