cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

serpent-sse2-i586-asm_32.S (13684B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on crypto/serpent.c by
 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
 */

#include <linux/linkage.h>

.file "serpent-sse2-i586-asm_32.S"
.text

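/* Stack offsets of the arguments (32-bit cdecl: return address at 0(%esp)). */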
#define arg_ctx 4
#define arg_dst 8
#define arg_src 12
#define arg_xor 16

/**********************************************************************
  4-way SSE2 serpent
 **********************************************************************/
#define CTX %edx

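/*
 * Cipher state: after the input transpose each register holds one of the
 * four 32-bit state words, with one block per lane; RE is scratch.
 */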
#define RA %xmm0
#define RB %xmm1
#define RC %xmm2
#define RD %xmm3
#define RE %xmm4

#define RT0 %xmm5
#define RT1 %xmm6

#define RNOT %xmm7

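/* Load word j of round subkey i and broadcast it to all four lanes of t. */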
#define get_key(i, j, t) \
	movd (4*(i)+(j))*4(CTX), t; \
	pshufd $0, t, t;

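/* XOR round subkey i into the four state words x0..x3. */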
#define K(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, x4); \
	get_key(i, 1, RT0); \
	get_key(i, 2, RT1); \
	pxor x4,		x0; \
	pxor RT0,		x1; \
	pxor RT1,		x2; \
	get_key(i, 3, x4); \
	pxor x4,		x3;

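/*
 * Forward linear transformation (rotates by 13, 3, 1, 7, 5 and 22 plus the
 * shift-and-XOR terms), followed by mixing in round subkey i.
 */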
#define LK(x0, x1, x2, x3, x4, i) \
	movdqa x0,		x4; \
	pslld $13,		x0; \
	psrld $(32 - 13),	x4; \
	por x4,			x0; \
	pxor x0,		x1; \
	movdqa x2,		x4; \
	pslld $3,		x2; \
	psrld $(32 - 3),	x4; \
	por x4,			x2; \
	pxor x2,		x1; \
	movdqa x1,		x4; \
	pslld $1,		x1; \
	psrld $(32 - 1),	x4; \
	por x4,			x1; \
	movdqa x0,		x4; \
	pslld $3,		x4; \
	pxor x2,		x3; \
	pxor x4,		x3; \
	movdqa x3,		x4; \
	pslld $7,		x3; \
	psrld $(32 - 7),	x4; \
	por x4,			x3; \
	movdqa x1,		x4; \
	pslld $7,		x4; \
	pxor x1,		x0; \
	pxor x3,		x0; \
	pxor x3,		x2; \
	pxor x4,		x2; \
	movdqa x0,		x4; \
	get_key(i, 1, RT0); \
	pxor RT0,		x1; \
	get_key(i, 3, RT0); \
	pxor RT0,		x3; \
	pslld $5,		x0; \
	psrld $(32 - 5),	x4; \
	por x4,			x0; \
	movdqa x2,		x4; \
	pslld $22,		x2; \
	psrld $(32 - 22),	x4; \
	por x4,			x2; \
	get_key(i, 0, RT0); \
	pxor RT0,		x0; \
	get_key(i, 2, RT0); \
	pxor RT0,		x2;

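/*
 * Mix in round subkey i, then apply the inverse of the linear transformation
 * above (rotates in the opposite direction).
 */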
#define KL(x0, x1, x2, x3, x4, i) \
	K(x0, x1, x2, x3, x4, i); \
	movdqa x0,		x4; \
	psrld $5,		x0; \
	pslld $(32 - 5),	x4; \
	por x4,			x0; \
	movdqa x2,		x4; \
	psrld $22,		x2; \
	pslld $(32 - 22),	x4; \
	por x4,			x2; \
	pxor x3,		x2; \
	pxor x3,		x0; \
	movdqa x1,		x4; \
	pslld $7,		x4; \
	pxor x1,		x0; \
	pxor x4,		x2; \
	movdqa x1,		x4; \
	psrld $1,		x1; \
	pslld $(32 - 1),	x4; \
	por x4,			x1; \
	movdqa x3,		x4; \
	psrld $7,		x3; \
	pslld $(32 - 7),	x4; \
	por x4,			x3; \
	pxor x0,		x1; \
	movdqa x0,		x4; \
	pslld $3,		x4; \
	pxor x4,		x3; \
	movdqa x0,		x4; \
	psrld $13,		x0; \
	pslld $(32 - 13),	x4; \
	por x4,			x0; \
	pxor x2,		x1; \
	pxor x2,		x3; \
	movdqa x2,		x4; \
	psrld $3,		x2; \
	pslld $(32 - 3),	x4; \
	por x4,			x2;

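/*
 * S0..S7 are the eight Serpent S-boxes and SI0..SI7 their inverses, expressed
 * as boolean operations on the four state registers; x4 is a temporary and
 * RNOT (all ones) implements bitwise NOT via pxor.
 */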
#define S0(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	por x0,			x3; \
	pxor x4,		x0; \
	pxor x2,		x4; \
	pxor RNOT,		x4; \
	pxor x1,		x3; \
	pand x0,		x1; \
	pxor x4,		x1; \
	pxor x0,		x2; \
	pxor x3,		x0; \
	por x0,			x4; \
	pxor x2,		x0; \
	pand x1,		x2; \
	pxor x2,		x3; \
	pxor RNOT,		x1; \
	pxor x4,		x2; \
	pxor x2,		x1;

#define S1(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	pxor x0,		x1; \
	pxor x3,		x0; \
	pxor RNOT,		x3; \
	pand x1,		x4; \
	por x1,			x0; \
	pxor x2,		x3; \
	pxor x3,		x0; \
	pxor x3,		x1; \
	pxor x4,		x3; \
	por x4,			x1; \
	pxor x2,		x4; \
	pand x0,		x2; \
	pxor x1,		x2; \
	por x0,			x1; \
	pxor RNOT,		x0; \
	pxor x2,		x0; \
	pxor x1,		x4;

#define S2(x0, x1, x2, x3, x4) \
	pxor RNOT,		x3; \
	pxor x0,		x1; \
	movdqa x0,		x4; \
	pand x2,		x0; \
	pxor x3,		x0; \
	por x4,			x3; \
	pxor x1,		x2; \
	pxor x1,		x3; \
	pand x0,		x1; \
	pxor x2,		x0; \
	pand x3,		x2; \
	por x1,			x3; \
	pxor RNOT,		x0; \
	pxor x0,		x3; \
	pxor x0,		x4; \
	pxor x2,		x0; \
	por x2,			x1;

#define S3(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	pxor x3,		x1; \
	por x0,			x3; \
	pand x0,		x4; \
	pxor x2,		x0; \
	pxor x1,		x2; \
	pand x3,		x1; \
	pxor x3,		x2; \
	por x4,			x0; \
	pxor x3,		x4; \
	pxor x0,		x1; \
	pand x3,		x0; \
	pand x4,		x3; \
	pxor x2,		x3; \
	por x1,			x4; \
	pand x1,		x2; \
	pxor x3,		x4; \
	pxor x3,		x0; \
	pxor x2,		x3;

#define S4(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	pand x0,		x3; \
	pxor x4,		x0; \
	pxor x2,		x3; \
	por x4,			x2; \
	pxor x1,		x0; \
	pxor x3,		x4; \
	por x0,			x2; \
	pxor x1,		x2; \
	pand x0,		x1; \
	pxor x4,		x1; \
	pand x2,		x4; \
	pxor x3,		x2; \
	pxor x0,		x4; \
	por x1,			x3; \
	pxor RNOT,		x1; \
	pxor x0,		x3;

#define S5(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	por x0,			x1; \
	pxor x1,		x2; \
	pxor RNOT,		x3; \
	pxor x0,		x4; \
	pxor x2,		x0; \
	pand x4,		x1; \
	por x3,			x4; \
	pxor x0,		x4; \
	pand x3,		x0; \
	pxor x3,		x1; \
	pxor x2,		x3; \
	pxor x1,		x0; \
	pand x4,		x2; \
	pxor x2,		x1; \
	pand x0,		x2; \
	pxor x2,		x3;

#define S6(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	pxor x0,		x3; \
	pxor x2,		x1; \
	pxor x0,		x2; \
	pand x3,		x0; \
	por x3,			x1; \
	pxor RNOT,		x4; \
	pxor x1,		x0; \
	pxor x2,		x1; \
	pxor x4,		x3; \
	pxor x0,		x4; \
	pand x0,		x2; \
	pxor x1,		x4; \
	pxor x3,		x2; \
	pand x1,		x3; \
	pxor x0,		x3; \
	pxor x2,		x1;

#define S7(x0, x1, x2, x3, x4) \
	pxor RNOT,		x1; \
	movdqa x1,		x4; \
	pxor RNOT,		x0; \
	pand x2,		x1; \
	pxor x3,		x1; \
	por x4,			x3; \
	pxor x2,		x4; \
	pxor x3,		x2; \
	pxor x0,		x3; \
	por x1,			x0; \
	pand x0,		x2; \
	pxor x4,		x0; \
	pxor x3,		x4; \
	pand x0,		x3; \
	pxor x1,		x4; \
	pxor x4,		x2; \
	pxor x1,		x3; \
	por x0,			x4; \
	pxor x1,		x4;

#define SI0(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	pxor x0,		x1; \
	por x1,			x3; \
	pxor x1,		x4; \
	pxor RNOT,		x0; \
	pxor x3,		x2; \
	pxor x0,		x3; \
	pand x1,		x0; \
	pxor x2,		x0; \
	pand x3,		x2; \
	pxor x4,		x3; \
	pxor x3,		x2; \
	pxor x3,		x1; \
	pand x0,		x3; \
	pxor x0,		x1; \
	pxor x2,		x0; \
	pxor x3,		x4;

#define SI1(x0, x1, x2, x3, x4) \
	pxor x3,		x1; \
	movdqa x0,		x4; \
	pxor x2,		x0; \
	pxor RNOT,		x2; \
	por x1,			x4; \
	pxor x3,		x4; \
	pand x1,		x3; \
	pxor x2,		x1; \
	pand x4,		x2; \
	pxor x1,		x4; \
	por x3,			x1; \
	pxor x0,		x3; \
	pxor x0,		x2; \
	por x4,			x0; \
	pxor x4,		x2; \
	pxor x0,		x1; \
	pxor x1,		x4;

#define SI2(x0, x1, x2, x3, x4) \
	pxor x1,		x2; \
	movdqa x3,		x4; \
	pxor RNOT,		x3; \
	por x2,			x3; \
	pxor x4,		x2; \
	pxor x0,		x4; \
	pxor x1,		x3; \
	por x2,			x1; \
	pxor x0,		x2; \
	pxor x4,		x1; \
	por x3,			x4; \
	pxor x3,		x2; \
	pxor x2,		x4; \
	pand x1,		x2; \
	pxor x3,		x2; \
	pxor x4,		x3; \
	pxor x0,		x4;

#define SI3(x0, x1, x2, x3, x4) \
	pxor x1,		x2; \
	movdqa x1,		x4; \
	pand x2,		x1; \
	pxor x0,		x1; \
	por x4,			x0; \
	pxor x3,		x4; \
	pxor x3,		x0; \
	por x1,			x3; \
	pxor x2,		x1; \
	pxor x3,		x1; \
	pxor x2,		x0; \
	pxor x3,		x2; \
	pand x1,		x3; \
	pxor x0,		x1; \
	pand x2,		x0; \
	pxor x3,		x4; \
	pxor x0,		x3; \
	pxor x1,		x0;

#define SI4(x0, x1, x2, x3, x4) \
	pxor x3,		x2; \
	movdqa x0,		x4; \
	pand x1,		x0; \
	pxor x2,		x0; \
	por x3,			x2; \
	pxor RNOT,		x4; \
	pxor x0,		x1; \
	pxor x2,		x0; \
	pand x4,		x2; \
	pxor x0,		x2; \
	por x4,			x0; \
	pxor x3,		x0; \
	pand x2,		x3; \
	pxor x3,		x4; \
	pxor x1,		x3; \
	pand x0,		x1; \
	pxor x1,		x4; \
	pxor x3,		x0;

#define SI5(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	por x2,			x1; \
	pxor x4,		x2; \
	pxor x3,		x1; \
	pand x4,		x3; \
	pxor x3,		x2; \
	por x0,			x3; \
	pxor RNOT,		x0; \
	pxor x2,		x3; \
	por x0,			x2; \
	pxor x1,		x4; \
	pxor x4,		x2; \
	pand x0,		x4; \
	pxor x1,		x0; \
	pxor x3,		x1; \
	pand x2,		x0; \
	pxor x3,		x2; \
	pxor x2,		x0; \
	pxor x4,		x2; \
	pxor x3,		x4;

#define SI6(x0, x1, x2, x3, x4) \
	pxor x2,		x0; \
	movdqa x0,		x4; \
	pand x3,		x0; \
	pxor x3,		x2; \
	pxor x2,		x0; \
	pxor x1,		x3; \
	por x4,			x2; \
	pxor x3,		x2; \
	pand x0,		x3; \
	pxor RNOT,		x0; \
	pxor x1,		x3; \
	pand x2,		x1; \
	pxor x0,		x4; \
	pxor x4,		x3; \
	pxor x2,		x4; \
	pxor x1,		x0; \
	pxor x0,		x2;

#define SI7(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	pand x0,		x3; \
	pxor x2,		x0; \
	por x4,			x2; \
	pxor x1,		x4; \
	pxor RNOT,		x0; \
	por x3,			x1; \
	pxor x0,		x4; \
	pand x2,		x0; \
	pxor x1,		x0; \
	pand x2,		x1; \
	pxor x2,		x3; \
	pxor x3,		x4; \
	pand x3,		x2; \
	por x0,			x3; \
	pxor x4,		x1; \
	pxor x4,		x3; \
	pand x0,		x4; \
	pxor x2,		x4;

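/*
 * Transpose a 4x4 matrix of 32-bit words so that each register ends up
 * holding the same state word of all four blocks.
 */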
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa x0,		t2; \
	punpckldq x1,		x0; \
	punpckhdq x1,		t2; \
	movdqa x2,		t1; \
	punpckhdq x3,		x2; \
	punpckldq x3,		t1; \
	movdqa x0,		x1; \
	punpcklqdq t1,		x0; \
	punpckhqdq t1,		x1; \
	movdqa t2,		x3; \
	punpcklqdq x2,		t2; \
	punpckhqdq x2,		x3; \
	movdqa t2,		x2;

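/*
 * Load four unaligned 16-byte blocks from 'in' and transpose them into the
 * one-word-per-register layout used by the round macros.
 */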
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu (0*4*4)(in),	x0; \
	movdqu (1*4*4)(in),	x1; \
	movdqu (2*4*4)(in),	x2; \
	movdqu (3*4*4)(in),	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

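/* Transpose back to block order and store the four blocks to 'out'. */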
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu x0, (0*4*4)(out); \
	movdqu x1, (1*4*4)(out); \
	movdqu x2, (2*4*4)(out); \
	movdqu x3, (3*4*4)(out);

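/*
 * Transpose back to block order and XOR the four result blocks into the data
 * already present at 'out'.
 */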
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu (0*4*4)(out),	t0; \
	pxor t0,		x0; \
	movdqu x0,		(0*4*4)(out); \
	movdqu (1*4*4)(out),	t0; \
	pxor t0,		x1; \
	movdqu x1,		(1*4*4)(out); \
	movdqu (2*4*4)(out),	t0; \
	pxor t0,		x2; \
	movdqu x2,		(2*4*4)(out); \
	movdqu (3*4*4)(out),	t0; \
	pxor t0,		x3; \
	movdqu x3,		(3*4*4)(out);

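/*
 * Note: the C-side glue code is expected to call these functions with
 * prototypes along the lines of the following (shown for illustration only;
 * this is an assumption, the exact declaration lives in the SSE2 glue header
 * and may differ between kernel trees):
 *
 *   asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst,
 *                                          const u8 *src, bool xor);
 *   asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst,
 *                                        const u8 *src);
 */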
SYM_FUNC_START(__serpent_enc_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 *	arg_xor(%esp): bool, if true: xor output
	 */

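	/* pcmpeqd of a register with itself yields all ones; RNOT is the
	 * constant used for bitwise NOT in the S-box macros. */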
	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

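	/* 32 rounds: XOR in the round subkey, apply the round's S-box, then
	 * the linear transform (fused with the next subkey in LK); the final
	 * round XORs in subkey 32 instead of applying the linear transform. */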
					 K(RA, RB, RC, RD, RE, 0);
	S0(RA, RB, RC, RD, RE);		LK(RC, RB, RD, RA, RE, 1);
	S1(RC, RB, RD, RA, RE);		LK(RE, RD, RA, RC, RB, 2);
	S2(RE, RD, RA, RC, RB);		LK(RB, RD, RE, RC, RA, 3);
	S3(RB, RD, RE, RC, RA);		LK(RC, RA, RD, RB, RE, 4);
	S4(RC, RA, RD, RB, RE);		LK(RA, RD, RB, RE, RC, 5);
	S5(RA, RD, RB, RE, RC);		LK(RC, RA, RD, RE, RB, 6);
	S6(RC, RA, RD, RE, RB);		LK(RD, RB, RA, RE, RC, 7);
	S7(RD, RB, RA, RE, RC);		LK(RC, RA, RE, RD, RB, 8);
	S0(RC, RA, RE, RD, RB);		LK(RE, RA, RD, RC, RB, 9);
	S1(RE, RA, RD, RC, RB);		LK(RB, RD, RC, RE, RA, 10);
	S2(RB, RD, RC, RE, RA);		LK(RA, RD, RB, RE, RC, 11);
	S3(RA, RD, RB, RE, RC);		LK(RE, RC, RD, RA, RB, 12);
	S4(RE, RC, RD, RA, RB);		LK(RC, RD, RA, RB, RE, 13);
	S5(RC, RD, RA, RB, RE);		LK(RE, RC, RD, RB, RA, 14);
	S6(RE, RC, RD, RB, RA);		LK(RD, RA, RC, RB, RE, 15);
	S7(RD, RA, RC, RB, RE);		LK(RE, RC, RB, RD, RA, 16);
	S0(RE, RC, RB, RD, RA);		LK(RB, RC, RD, RE, RA, 17);
	S1(RB, RC, RD, RE, RA);		LK(RA, RD, RE, RB, RC, 18);
	S2(RA, RD, RE, RB, RC);		LK(RC, RD, RA, RB, RE, 19);
	S3(RC, RD, RA, RB, RE);		LK(RB, RE, RD, RC, RA, 20);
	S4(RB, RE, RD, RC, RA);		LK(RE, RD, RC, RA, RB, 21);
	S5(RE, RD, RC, RA, RB);		LK(RB, RE, RD, RA, RC, 22);
	S6(RB, RE, RD, RA, RC);		LK(RD, RC, RE, RA, RB, 23);
	S7(RD, RC, RE, RA, RB);		LK(RB, RE, RA, RD, RC, 24);
	S0(RB, RE, RA, RD, RC);		LK(RA, RE, RD, RB, RC, 25);
	S1(RA, RE, RD, RB, RC);		LK(RC, RD, RB, RA, RE, 26);
	S2(RC, RD, RB, RA, RE);		LK(RE, RD, RC, RA, RB, 27);
	S3(RE, RD, RC, RA, RB);		LK(RA, RB, RD, RE, RC, 28);
	S4(RA, RB, RD, RE, RC);		LK(RB, RD, RE, RC, RA, 29);
	S5(RB, RD, RE, RC, RA);		LK(RA, RB, RD, RC, RE, 30);
	S6(RA, RB, RD, RC, RE);		LK(RD, RE, RB, RC, RA, 31);
	S7(RD, RE, RB, RC, RA);		 K(RA, RB, RC, RD, RE, 32);

	movl arg_dst(%esp), %eax;

	cmpb $0, arg_xor(%esp);
	jnz .L__enc_xor4;

	write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	RET;

.L__enc_xor4:
	xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	RET;
SYM_FUNC_END(__serpent_enc_blk_4way)

SYM_FUNC_START(serpent_dec_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 */

	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

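	/* Decryption runs the rounds in reverse: subkey 32 first, then the
	 * inverse S-boxes SI7..SI0, each followed by the preceding subkey and
	 * the inverse linear transform; the final step XORs in subkey 0 only. */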
					 K(RA, RB, RC, RD, RE, 32);
	SI7(RA, RB, RC, RD, RE);	KL(RB, RD, RA, RE, RC, 31);
	SI6(RB, RD, RA, RE, RC);	KL(RA, RC, RE, RB, RD, 30);
	SI5(RA, RC, RE, RB, RD);	KL(RC, RD, RA, RE, RB, 29);
	SI4(RC, RD, RA, RE, RB);	KL(RC, RA, RB, RE, RD, 28);
	SI3(RC, RA, RB, RE, RD);	KL(RB, RC, RD, RE, RA, 27);
	SI2(RB, RC, RD, RE, RA);	KL(RC, RA, RE, RD, RB, 26);
	SI1(RC, RA, RE, RD, RB);	KL(RB, RA, RE, RD, RC, 25);
	SI0(RB, RA, RE, RD, RC);	KL(RE, RC, RA, RB, RD, 24);
	SI7(RE, RC, RA, RB, RD);	KL(RC, RB, RE, RD, RA, 23);
	SI6(RC, RB, RE, RD, RA);	KL(RE, RA, RD, RC, RB, 22);
	SI5(RE, RA, RD, RC, RB);	KL(RA, RB, RE, RD, RC, 21);
	SI4(RA, RB, RE, RD, RC);	KL(RA, RE, RC, RD, RB, 20);
	SI3(RA, RE, RC, RD, RB);	KL(RC, RA, RB, RD, RE, 19);
	SI2(RC, RA, RB, RD, RE);	KL(RA, RE, RD, RB, RC, 18);
	SI1(RA, RE, RD, RB, RC);	KL(RC, RE, RD, RB, RA, 17);
	SI0(RC, RE, RD, RB, RA);	KL(RD, RA, RE, RC, RB, 16);
	SI7(RD, RA, RE, RC, RB);	KL(RA, RC, RD, RB, RE, 15);
	SI6(RA, RC, RD, RB, RE);	KL(RD, RE, RB, RA, RC, 14);
	SI5(RD, RE, RB, RA, RC);	KL(RE, RC, RD, RB, RA, 13);
	SI4(RE, RC, RD, RB, RA);	KL(RE, RD, RA, RB, RC, 12);
	SI3(RE, RD, RA, RB, RC);	KL(RA, RE, RC, RB, RD, 11);
	SI2(RA, RE, RC, RB, RD);	KL(RE, RD, RB, RC, RA, 10);
	SI1(RE, RD, RB, RC, RA);	KL(RA, RD, RB, RC, RE, 9);
	SI0(RA, RD, RB, RC, RE);	KL(RB, RE, RD, RA, RC, 8);
	SI7(RB, RE, RD, RA, RC);	KL(RE, RA, RB, RC, RD, 7);
	SI6(RE, RA, RB, RC, RD);	KL(RB, RD, RC, RE, RA, 6);
	SI5(RB, RD, RC, RE, RA);	KL(RD, RA, RB, RC, RE, 5);
	SI4(RD, RA, RB, RC, RE);	KL(RD, RB, RE, RC, RA, 4);
	SI3(RD, RB, RE, RC, RA);	KL(RE, RD, RA, RC, RB, 3);
	SI2(RE, RD, RA, RC, RB);	KL(RD, RB, RC, RA, RE, 2);
	SI1(RD, RB, RC, RA, RE);	KL(RE, RB, RC, RA, RD, 1);
	SI0(RE, RB, RC, RA, RD);	 K(RC, RD, RB, RE, RA, 0);

	movl arg_dst(%esp), %eax;
	write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);

	RET;
SYM_FUNC_END(serpent_dec_blk_4way)