cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

serpent-sse2-x86_64-asm_64.S (18538B)


      1/* SPDX-License-Identifier: GPL-2.0-or-later */
      2/*
      3 * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
      4 *
      5 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
      6 *
      7 * Based on crypto/serpent.c by
      8 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
      9 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
     10 */
     11
     12#include <linux/linkage.h>
     13
     14.file "serpent-sse2-x86_64-asm_64.S"
     15.text
     16
     /*
      * Register aliases used by every macro in this file.
      * CTX is %rdi, which both entry points document as the ctx argument;
      * get_key() reads round-key words relative to it.
      */
     17#define CTX %rdi
     18
     19/**********************************************************************
     20  8-way SSE2 serpent
     21 **********************************************************************/
     /*
      * First register group (suffix 1).  The macros take the unsuffixed name
      * (RA..RE) and paste the suffix on with ##.  read_blocks() fills RA..RD of
      * a group from four 16-byte blocks; RE starts out as the group's temporary
      * and the roles rotate from round to round at the call sites.
      */
     22#define RA1 %xmm0
     23#define RB1 %xmm1
     24#define RC1 %xmm2
     25#define RD1 %xmm3
     26#define RE1 %xmm4
     27
     /* Second register group (suffix 2), used the same way for four more blocks. */
     28#define RA2 %xmm5
     29#define RB2 %xmm6
     30#define RC2 %xmm7
     31#define RD2 %xmm8
     32#define RE2 %xmm9
     33
     /*
      * All-ones mask: both entry points set it with "pcmpeqd RNOT, RNOT".
      * "pxor RNOT, x" therefore inverts x.
      */
     34#define RNOT %xmm10
     35
     /*
      * Round-key words as loaded by get_key() (one 32-bit word replicated to
      * all four lanes).  RK0..RK2 are also passed as scratch registers to
      * read_blocks()/write_blocks()/xor_blocks() at the entry points.
      */
     36#define RK0 %xmm11
     37#define RK1 %xmm12
     38#define RK2 %xmm13
     39#define RK3 %xmm14
     40
     /*
      * S-box 0 as two macro halves; S() runs S0_1 then S0_2 on each register
      * group.  Only movdqa/pand/por/pxor are used.  "pxor RNOT" acts as a
      * bitwise NOT, since the entry points set RNOT to all ones.  x4 is
      * scratch: S0_1 overwrites it with movdqa before anything reads it.
      * At the call sites the step after S0 takes the registers in the order
      * (x2, x1, x3, x0), with x4 as its temporary.
      */
     41#define S0_1(x0, x1, x2, x3, x4) \
     42	movdqa x3,		x4; \
     43	por x0,			x3; \
     44	pxor x4,		x0; \
     45	pxor x2,		x4; \
     46	pxor RNOT,		x4; \
     47	pxor x1,		x3; \
     48	pand x0,		x1; \
     49	pxor x4,		x1; \
     50	pxor x0,		x2;
     51#define S0_2(x0, x1, x2, x3, x4) \
     52	pxor x3,		x0; \
     53	por x0,			x4; \
     54	pxor x2,		x0; \
     55	pand x1,		x2; \
     56	pxor x2,		x3; \
     57	pxor RNOT,		x1; \
     58	pxor x4,		x2; \
     59	pxor x2,		x1;
     60
     /*
      * S-box 1 as two macro halves; S() runs S1_1 then S1_2 on each register
      * group.  Only movdqa/pand/por/pxor are used; "pxor RNOT" is a bitwise NOT
      * (RNOT is all ones).  x4 is scratch: S1_1 overwrites it with movdqa
      * before anything reads it.
      * At the call sites the step after S1 takes the registers in the order
      * (x4, x2, x3, x0), with x1 as its temporary.
      */
     61#define S1_1(x0, x1, x2, x3, x4) \
     62	movdqa x1,		x4; \
     63	pxor x0,		x1; \
     64	pxor x3,		x0; \
     65	pxor RNOT,		x3; \
     66	pand x1,		x4; \
     67	por x1,			x0; \
     68	pxor x2,		x3; \
     69	pxor x3,		x0; \
     70	pxor x3,		x1;
     71#define S1_2(x0, x1, x2, x3, x4) \
     72	pxor x4,		x3; \
     73	por x4,			x1; \
     74	pxor x2,		x4; \
     75	pand x0,		x2; \
     76	pxor x1,		x2; \
     77	por x0,			x1; \
     78	pxor RNOT,		x0; \
     79	pxor x2,		x0; \
     80	pxor x1,		x4;
     81
     /*
      * S-box 2 as two macro halves; S() runs S2_1 then S2_2 on each register
      * group.  Only movdqa/pand/por/pxor are used; "pxor RNOT" is a bitwise NOT
      * (RNOT is all ones).  x4 is scratch: S2_1 overwrites it with movdqa
      * before anything reads it.
      * At the call sites the step after S2 takes the registers in the order
      * (x4, x1, x0, x3), with x2 as its temporary.
      */
     82#define S2_1(x0, x1, x2, x3, x4) \
     83	pxor RNOT,		x3; \
     84	pxor x0,		x1; \
     85	movdqa x0,		x4; \
     86	pand x2,		x0; \
     87	pxor x3,		x0; \
     88	por x4,			x3; \
     89	pxor x1,		x2; \
     90	pxor x1,		x3; \
     91	pand x0,		x1;
     92#define S2_2(x0, x1, x2, x3, x4) \
     93	pxor x2,		x0; \
     94	pand x3,		x2; \
     95	por x1,			x3; \
     96	pxor RNOT,		x0; \
     97	pxor x0,		x3; \
     98	pxor x0,		x4; \
     99	pxor x2,		x0; \
    100	por x2,			x1;
    101
    /*
     * S-box 3 as two macro halves; S() runs S3_1 then S3_2 on each register
     * group.  Only movdqa/pand/por/pxor are used (no inversion via RNOT in this
     * one).  x4 is scratch: S3_1 overwrites it with movdqa before anything
     * reads it.
     * At the call sites the step after S3 takes the registers in the order
     * (x3, x4, x1, x0), with x2 as its temporary.
     */
    102#define S3_1(x0, x1, x2, x3, x4) \
    103	movdqa x1,		x4; \
    104	pxor x3,		x1; \
    105	por x0,			x3; \
    106	pand x0,		x4; \
    107	pxor x2,		x0; \
    108	pxor x1,		x2; \
    109	pand x3,		x1; \
    110	pxor x3,		x2; \
    111	por x4,			x0; \
    112	pxor x3,		x4;
    113#define S3_2(x0, x1, x2, x3, x4) \
    114	pxor x0,		x1; \
    115	pand x3,		x0; \
    116	pand x4,		x3; \
    117	pxor x2,		x3; \
    118	por x1,			x4; \
    119	pand x1,		x2; \
    120	pxor x3,		x4; \
    121	pxor x3,		x0; \
    122	pxor x2,		x3;
    123
    /*
     * S-box 4 as two macro halves; S() runs S4_1 then S4_2 on each register
     * group.  Only movdqa/pand/por/pxor are used; "pxor RNOT" is a bitwise NOT
     * (RNOT is all ones).  x4 is scratch: S4_1 overwrites it with movdqa
     * before anything reads it.
     * At the call sites the step after S4 takes the registers in the order
     * (x1, x2, x3, x4), with x0 as its temporary.
     */
    124#define S4_1(x0, x1, x2, x3, x4) \
    125	movdqa x3,		x4; \
    126	pand x0,		x3; \
    127	pxor x4,		x0; \
    128	pxor x2,		x3; \
    129	por x4,			x2; \
    130	pxor x1,		x0; \
    131	pxor x3,		x4; \
    132	por x0,			x2; \
    133	pxor x1,		x2;
    134#define S4_2(x0, x1, x2, x3, x4) \
    135	pand x0,		x1; \
    136	pxor x4,		x1; \
    137	pand x2,		x4; \
    138	pxor x3,		x2; \
    139	pxor x0,		x4; \
    140	por x1,			x3; \
    141	pxor RNOT,		x1; \
    142	pxor x0,		x3;
    143
    /*
     * S-box 5 as two macro halves; S() runs S5_1 then S5_2 on each register
     * group.  Only movdqa/pand/por/pxor are used; "pxor RNOT" is a bitwise NOT
     * (RNOT is all ones).  x4 is scratch: S5_1 overwrites it with movdqa
     * before anything reads it.
     * At the call sites the step after S5 takes the registers in the order
     * (x4, x0, x1, x3), with x2 as its temporary.
     */
    144#define S5_1(x0, x1, x2, x3, x4) \
    145	movdqa x1,		x4; \
    146	por x0,			x1; \
    147	pxor x1,		x2; \
    148	pxor RNOT,		x3; \
    149	pxor x0,		x4; \
    150	pxor x2,		x0; \
    151	pand x4,		x1; \
    152	por x3,			x4; \
    153	pxor x0,		x4;
    154#define S5_2(x0, x1, x2, x3, x4) \
    155	pand x3,		x0; \
    156	pxor x3,		x1; \
    157	pxor x2,		x3; \
    158	pxor x1,		x0; \
    159	pand x4,		x2; \
    160	pxor x2,		x1; \
    161	pand x0,		x2; \
    162	pxor x2,		x3;
    163
    /*
     * S-box 6 as two macro halves; S() runs S6_1 then S6_2 on each register
     * group.  Only movdqa/pand/por/pxor are used; "pxor RNOT" is a bitwise NOT
     * (RNOT is all ones).  x4 is scratch: S6_1 overwrites it with movdqa
     * before anything reads it.
     * At the call sites the step after S6 takes the registers in the order
     * (x2, x4, x1, x3), with x0 as its temporary.
     */
    164#define S6_1(x0, x1, x2, x3, x4) \
    165	movdqa x1,		x4; \
    166	pxor x0,		x3; \
    167	pxor x2,		x1; \
    168	pxor x0,		x2; \
    169	pand x3,		x0; \
    170	por x3,			x1; \
    171	pxor RNOT,		x4; \
    172	pxor x1,		x0; \
    173	pxor x2,		x1;
    174#define S6_2(x0, x1, x2, x3, x4) \
    175	pxor x4,		x3; \
    176	pxor x0,		x4; \
    177	pand x0,		x2; \
    178	pxor x1,		x4; \
    179	pxor x3,		x2; \
    180	pand x1,		x3; \
    181	pxor x0,		x3; \
    182	pxor x2,		x1;
    183
    /*
     * S-box 7 as two macro halves; S() runs S7_1 then S7_2 on each register
     * group.  Only movdqa/pand/por/pxor are used; "pxor RNOT" is a bitwise NOT
     * (RNOT is all ones).  x4 is scratch: S7_1 overwrites it with movdqa
     * before anything reads it.
     * At the call sites the step after S7 takes the registers in the order
     * (x4, x2, x3, x0), with x1 as its temporary.
     */
    184#define S7_1(x0, x1, x2, x3, x4) \
    185	pxor RNOT,		x1; \
    186	movdqa x1,		x4; \
    187	pxor RNOT,		x0; \
    188	pand x2,		x1; \
    189	pxor x3,		x1; \
    190	por x4,			x3; \
    191	pxor x2,		x4; \
    192	pxor x3,		x2; \
    193	pxor x0,		x3; \
    194	por x1,			x0;
    195#define S7_2(x0, x1, x2, x3, x4) \
    196	pand x0,		x2; \
    197	pxor x4,		x0; \
    198	pxor x3,		x4; \
    199	pand x0,		x3; \
    200	pxor x1,		x4; \
    201	pxor x4,		x2; \
    202	pxor x1,		x3; \
    203	por x0,			x4; \
    204	pxor x1,		x4;
    205
    /*
     * SI0, used only by serpent_dec_blk_8way (presumably the inverse of S0),
     * as two macro halves.  It is invoked through SP(), which interleaves the
     * halves of both register groups with round-key loads, and through S() for
     * the final decryption step.  Only movdqa/pand/por/pxor are used;
     * "pxor RNOT" is a bitwise NOT (RNOT is all ones).  x4 is scratch: SI0_1
     * overwrites it with movdqa before anything reads it.
     * At the call sites the step after SI0 takes the registers in the order
     * (x2, x4, x1, x0), with x3 as its temporary.
     */
    206#define SI0_1(x0, x1, x2, x3, x4) \
    207	movdqa x3,		x4; \
    208	pxor x0,		x1; \
    209	por x1,			x3; \
    210	pxor x1,		x4; \
    211	pxor RNOT,		x0; \
    212	pxor x3,		x2; \
    213	pxor x0,		x3; \
    214	pand x1,		x0; \
    215	pxor x2,		x0;
    216#define SI0_2(x0, x1, x2, x3, x4) \
    217	pand x3,		x2; \
    218	pxor x4,		x3; \
    219	pxor x3,		x2; \
    220	pxor x3,		x1; \
    221	pand x0,		x3; \
    222	pxor x0,		x1; \
    223	pxor x2,		x0; \
    224	pxor x3,		x4;
    225
    /*
     * SI1, used only by serpent_dec_blk_8way (presumably the inverse of S1),
     * as two macro halves.  SP() interleaves the halves of both register
     * groups with round-key loads.  Only movdqa/pand/por/pxor are used;
     * "pxor RNOT" is a bitwise NOT (RNOT is all ones).  x4 is scratch: SI1_1
     * overwrites it with movdqa before anything reads it.
     * At the call sites the step after SI1 takes the registers in the order
     * (x4, x1, x2, x3), with x0 as its temporary.
     */
    226#define SI1_1(x0, x1, x2, x3, x4) \
    227	pxor x3,		x1; \
    228	movdqa x0,		x4; \
    229	pxor x2,		x0; \
    230	pxor RNOT,		x2; \
    231	por x1,			x4; \
    232	pxor x3,		x4; \
    233	pand x1,		x3; \
    234	pxor x2,		x1; \
    235	pand x4,		x2;
    236#define SI1_2(x0, x1, x2, x3, x4) \
    237	pxor x1,		x4; \
    238	por x3,			x1; \
    239	pxor x0,		x3; \
    240	pxor x0,		x2; \
    241	por x4,			x0; \
    242	pxor x4,		x2; \
    243	pxor x0,		x1; \
    244	pxor x1,		x4;
    245
    /*
     * SI2, used only by serpent_dec_blk_8way (presumably the inverse of S2),
     * as two macro halves.  SP() interleaves the halves of both register
     * groups with round-key loads.  Only movdqa/pand/por/pxor are used;
     * "pxor RNOT" is a bitwise NOT (RNOT is all ones).  x4 is scratch: SI2_1
     * overwrites it with movdqa before anything reads it.
     * At the call sites the step after SI2 takes the registers in the order
     * (x1, x4, x3, x2), with x0 as its temporary.
     */
    246#define SI2_1(x0, x1, x2, x3, x4) \
    247	pxor x1,		x2; \
    248	movdqa x3,		x4; \
    249	pxor RNOT,		x3; \
    250	por x2,			x3; \
    251	pxor x4,		x2; \
    252	pxor x0,		x4; \
    253	pxor x1,		x3; \
    254	por x2,			x1; \
    255	pxor x0,		x2;
    256#define SI2_2(x0, x1, x2, x3, x4) \
    257	pxor x4,		x1; \
    258	por x3,			x4; \
    259	pxor x3,		x2; \
    260	pxor x2,		x4; \
    261	pand x1,		x2; \
    262	pxor x3,		x2; \
    263	pxor x4,		x3; \
    264	pxor x0,		x4;
    265
    /*
     * SI3, used only by serpent_dec_blk_8way (presumably the inverse of S3),
     * as two macro halves.  SP() interleaves the halves of both register
     * groups with round-key loads.  Only movdqa/pand/por/pxor are used (no
     * inversion via RNOT in this one).  x4 is scratch: SI3_1 overwrites it
     * with movdqa before anything reads it.
     * At the call sites the step after SI3 takes the registers in the order
     * (x2, x0, x4, x3), with x1 as its temporary.
     */
    266#define SI3_1(x0, x1, x2, x3, x4) \
    267	pxor x1,		x2; \
    268	movdqa x1,		x4; \
    269	pand x2,		x1; \
    270	pxor x0,		x1; \
    271	por x4,			x0; \
    272	pxor x3,		x4; \
    273	pxor x3,		x0; \
    274	por x1,			x3; \
    275	pxor x2,		x1;
    276#define SI3_2(x0, x1, x2, x3, x4) \
    277	pxor x3,		x1; \
    278	pxor x2,		x0; \
    279	pxor x3,		x2; \
    280	pand x1,		x3; \
    281	pxor x0,		x1; \
    282	pand x2,		x0; \
    283	pxor x3,		x4; \
    284	pxor x0,		x3; \
    285	pxor x1,		x0;
    286
    /*
     * SI4, used only by serpent_dec_blk_8way (presumably the inverse of S4),
     * as two macro halves.  SP() interleaves the halves of both register
     * groups with round-key loads.  Only movdqa/pand/por/pxor are used;
     * "pxor RNOT" is a bitwise NOT (RNOT is all ones).  x4 is scratch: SI4_1
     * overwrites it with movdqa before anything reads it.
     * At the call sites the step after SI4 takes the registers in the order
     * (x0, x2, x4, x3), with x1 as its temporary.
     */
    287#define SI4_1(x0, x1, x2, x3, x4) \
    288	pxor x3,		x2; \
    289	movdqa x0,		x4; \
    290	pand x1,		x0; \
    291	pxor x2,		x0; \
    292	por x3,			x2; \
    293	pxor RNOT,		x4; \
    294	pxor x0,		x1; \
    295	pxor x2,		x0; \
    296	pand x4,		x2;
    297#define SI4_2(x0, x1, x2, x3, x4) \
    298	pxor x0,		x2; \
    299	por x4,			x0; \
    300	pxor x3,		x0; \
    301	pand x2,		x3; \
    302	pxor x3,		x4; \
    303	pxor x1,		x3; \
    304	pand x0,		x1; \
    305	pxor x1,		x4; \
    306	pxor x3,		x0;
    307
    /*
     * SI5, used only by serpent_dec_blk_8way (presumably the inverse of S5),
     * as two macro halves.  SP() interleaves the halves of both register
     * groups with round-key loads.  Only movdqa/pand/por/pxor are used;
     * "pxor RNOT" is a bitwise NOT (RNOT is all ones).  x4 is scratch: SI5_1
     * overwrites it with movdqa before anything reads it.
     * At the call sites the step after SI5 takes the registers in the order
     * (x1, x4, x0, x2), with x3 as its temporary.
     */
    308#define SI5_1(x0, x1, x2, x3, x4) \
    309	movdqa x1,		x4; \
    310	por x2,			x1; \
    311	pxor x4,		x2; \
    312	pxor x3,		x1; \
    313	pand x4,		x3; \
    314	pxor x3,		x2; \
    315	por x0,			x3; \
    316	pxor RNOT,		x0; \
    317	pxor x2,		x3; \
    318	por x0,			x2;
    319#define SI5_2(x0, x1, x2, x3, x4) \
    320	pxor x1,		x4; \
    321	pxor x4,		x2; \
    322	pand x0,		x4; \
    323	pxor x1,		x0; \
    324	pxor x3,		x1; \
    325	pand x2,		x0; \
    326	pxor x3,		x2; \
    327	pxor x2,		x0; \
    328	pxor x4,		x2; \
    329	pxor x3,		x4;
    330
    /*
     * SI6, used only by serpent_dec_blk_8way (presumably the inverse of S6),
     * as two macro halves.  SP() interleaves the halves of both register
     * groups with round-key loads.  Only movdqa/pand/por/pxor are used;
     * "pxor RNOT" is a bitwise NOT (RNOT is all ones).  x4 is scratch: SI6_1
     * overwrites it with movdqa before anything reads it.
     * At the call sites the step after SI6 takes the registers in the order
     * (x2, x4, x3, x0), with x1 as its temporary.
     */
    331#define SI6_1(x0, x1, x2, x3, x4) \
    332	pxor x2,		x0; \
    333	movdqa x0,		x4; \
    334	pand x3,		x0; \
    335	pxor x3,		x2; \
    336	pxor x2,		x0; \
    337	pxor x1,		x3; \
    338	por x4,			x2; \
    339	pxor x3,		x2; \
    340	pand x0,		x3;
    341#define SI6_2(x0, x1, x2, x3, x4) \
    342	pxor RNOT,		x0; \
    343	pxor x1,		x3; \
    344	pand x2,		x1; \
    345	pxor x0,		x4; \
    346	pxor x4,		x3; \
    347	pxor x2,		x4; \
    348	pxor x1,		x0; \
    349	pxor x0,		x2;
    350
    /*
     * SI7, used only by serpent_dec_blk_8way (presumably the inverse of S7),
     * as two macro halves.  SP() interleaves the halves of both register
     * groups with round-key loads.  Only movdqa/pand/por/pxor are used;
     * "pxor RNOT" is a bitwise NOT (RNOT is all ones).  x4 is scratch: SI7_1
     * overwrites it with movdqa before anything reads it.
     * At the call sites the step after SI7 takes the registers in the order
     * (x1, x3, x0, x4), with x2 as its temporary.
     */
    351#define SI7_1(x0, x1, x2, x3, x4) \
    352	movdqa x3,		x4; \
    353	pand x0,		x3; \
    354	pxor x2,		x0; \
    355	por x4,			x2; \
    356	pxor x1,		x4; \
    357	pxor RNOT,		x0; \
    358	por x3,			x1; \
    359	pxor x0,		x4; \
    360	pand x2,		x0; \
    361	pxor x1,		x0;
    362#define SI7_2(x0, x1, x2, x3, x4) \
    363	pand x2,		x1; \
    364	pxor x2,		x3; \
    365	pxor x3,		x4; \
    366	pand x3,		x2; \
    367	por x0,			x3; \
    368	pxor x4,		x1; \
    369	pxor x4,		x3; \
    370	pand x0,		x4; \
    371	pxor x2,		x4;
    372
    /*
     * get_key(rnd, word, dst)
     *
     * Fetch one 32-bit round-key word and replicate it across dst:
     *   movd   loads the dword at byte offset (4*rnd + word)*4 from CTX into
     *          the low lane of dst;
     *   pshufd $0 copies that low lane into all four dword lanes.
     * Callers in this file pass word = 0..3 and dst = RK0..RK3.
     */
    373#define get_key(rnd, word, dst) \
    374	movd (4*(rnd)+(word))*4(CTX), dst; \
    375	pshufd $0, dst, dst;
    376
    /*
     * K2(a, b, c, d, unused, rnd)
     *
     * Key mixing only: load the four words of round key rnd into RK0..RK3 and
     * XOR them into registers a..d of both groups (suffix 1 first, then
     * suffix 2).  The fifth argument is accepted so that K2 has the same
     * argument shape as LK2/KL2, but it is not referenced.
     */
    377#define K2(a, b, c, d, unused, rnd) \
    378	get_key(rnd, 0, RK0); \
    379	get_key(rnd, 1, RK1); \
    380	get_key(rnd, 2, RK2); \
    381	get_key(rnd, 3, RK3); \
    382	pxor RK0,	a ## 1; \
    383	pxor RK1,	b ## 1; \
    384	pxor RK2,	c ## 1; \
    385	pxor RK3,	d ## 1; \
    386		pxor RK0,	a ## 2; \
    387		pxor RK1,	b ## 2; \
    388		pxor RK2,	c ## 2; \
    389		pxor RK3,	d ## 2;
    390
    /*
     * LK2(x0, x1, x2, x3, x4, i): linear transformation followed by XOR with
     * round key i, applied to both register groups.  Suffix-1 code is indented
     * one tab, the matching suffix-2 code two tabs; the two are interleaved in
     * chunks.  x4 is scratch.  Each rotate is built from movdqa + pslld +
     * psrld + por.  Net effect per 32-bit lane (k = round key i):
     *
     *	x0 = rol(x0, 13);	x2 = rol(x2, 3);
     *	x1 ^= x0 ^ x2;		x3 ^= x2 ^ (x0 << 3);
     *	x1 = rol(x1, 1);	x3 = rol(x3, 7);
     *	x0 ^= x1 ^ x3;		x2 ^= x3 ^ (x1 << 7);
     *	x0 = rol(x0, 5);	x2 = rol(x2, 22);
     *	x0 ^= k[0]; x1 ^= k[1]; x2 ^= k[2]; x3 ^= k[3];
     *
     * The four get_key() loads of round key i (into RK1, RK3, RK0, RK2, in
     * that order) are spread through the body ahead of the final XORs.
     */
    391#define LK2(x0, x1, x2, x3, x4, i) \
    392	movdqa x0 ## 1,		x4 ## 1; \
    393	pslld $13,		x0 ## 1; \
    394	psrld $(32 - 13),	x4 ## 1; \
    395	por x4 ## 1,		x0 ## 1; \
    396	pxor x0 ## 1,		x1 ## 1; \
    397	movdqa x2 ## 1,		x4 ## 1; \
    398	pslld $3,		x2 ## 1; \
    399	psrld $(32 - 3),	x4 ## 1; \
    400	por x4 ## 1,		x2 ## 1; \
    401	pxor x2 ## 1,		x1 ## 1; \
    402		movdqa x0 ## 2,		x4 ## 2; \
    403		pslld $13,		x0 ## 2; \
    404		psrld $(32 - 13),	x4 ## 2; \
    405		por x4 ## 2,		x0 ## 2; \
    406		pxor x0 ## 2,		x1 ## 2; \
    407		movdqa x2 ## 2,		x4 ## 2; \
    408		pslld $3,		x2 ## 2; \
    409		psrld $(32 - 3),	x4 ## 2; \
    410		por x4 ## 2,		x2 ## 2; \
    411		pxor x2 ## 2,		x1 ## 2; \
    412	movdqa x1 ## 1,		x4 ## 1; \
    413	pslld $1,		x1 ## 1; \
    414	psrld $(32 - 1),	x4 ## 1; \
    415	por x4 ## 1,		x1 ## 1; \
    416	movdqa x0 ## 1,		x4 ## 1; \
    417	pslld $3,		x4 ## 1; \
    418	pxor x2 ## 1,		x3 ## 1; \
    419	pxor x4 ## 1,		x3 ## 1; \
    420	movdqa x3 ## 1,		x4 ## 1; \
    421	get_key(i, 1, RK1); \
    422		movdqa x1 ## 2,		x4 ## 2; \
    423		pslld $1,		x1 ## 2; \
    424		psrld $(32 - 1),	x4 ## 2; \
    425		por x4 ## 2,		x1 ## 2; \
    426		movdqa x0 ## 2,		x4 ## 2; \
    427		pslld $3,		x4 ## 2; \
    428		pxor x2 ## 2,		x3 ## 2; \
    429		pxor x4 ## 2,		x3 ## 2; \
    430		movdqa x3 ## 2,		x4 ## 2; \
    431		get_key(i, 3, RK3); \
    432	pslld $7,		x3 ## 1; \
    433	psrld $(32 - 7),	x4 ## 1; \
    434	por x4 ## 1,		x3 ## 1; \
    435	movdqa x1 ## 1,		x4 ## 1; \
    436	pslld $7,		x4 ## 1; \
    437	pxor x1 ## 1,		x0 ## 1; \
    438	pxor x3 ## 1,		x0 ## 1; \
    439	pxor x3 ## 1,		x2 ## 1; \
    440	pxor x4 ## 1,		x2 ## 1; \
    441	get_key(i, 0, RK0); \
    442		pslld $7,		x3 ## 2; \
    443		psrld $(32 - 7),	x4 ## 2; \
    444		por x4 ## 2,		x3 ## 2; \
    445		movdqa x1 ## 2,		x4 ## 2; \
    446		pslld $7,		x4 ## 2; \
    447		pxor x1 ## 2,		x0 ## 2; \
    448		pxor x3 ## 2,		x0 ## 2; \
    449		pxor x3 ## 2,		x2 ## 2; \
    450		pxor x4 ## 2,		x2 ## 2; \
    451		get_key(i, 2, RK2); \
    452	pxor RK1,		x1 ## 1; \
    453	pxor RK3,		x3 ## 1; \
    454	movdqa x0 ## 1,		x4 ## 1; \
    455	pslld $5,		x0 ## 1; \
    456	psrld $(32 - 5),	x4 ## 1; \
    457	por x4 ## 1,		x0 ## 1; \
    458	movdqa x2 ## 1,		x4 ## 1; \
    459	pslld $22,		x2 ## 1; \
    460	psrld $(32 - 22),	x4 ## 1; \
    461	por x4 ## 1,		x2 ## 1; \
    462	pxor RK0,		x0 ## 1; \
    463	pxor RK2,		x2 ## 1; \
    464		pxor RK1,		x1 ## 2; \
    465		pxor RK3,		x3 ## 2; \
    466		movdqa x0 ## 2,		x4 ## 2; \
    467		pslld $5,		x0 ## 2; \
    468		psrld $(32 - 5),	x4 ## 2; \
    469		por x4 ## 2,		x0 ## 2; \
    470		movdqa x2 ## 2,		x4 ## 2; \
    471		pslld $22,		x2 ## 2; \
    472		psrld $(32 - 22),	x4 ## 2; \
    473		por x4 ## 2,		x2 ## 2; \
    474		pxor RK0,		x0 ## 2; \
    475		pxor RK2,		x2 ## 2;
    476
    /*
     * KL2(x0, x1, x2, x3, x4, i): XOR with the round key held in RK0..RK3,
     * then the rotate/XOR steps of LK2 undone in reverse order, applied to
     * both register groups (suffix-1 code one tab, suffix-2 code two tabs,
     * interleaved in chunks).  x4 is scratch.
     *
     * The body never references i and performs no key load: RK0..RK3 must
     * already hold the wanted round key.  At the call sites the preceding
     * SP(..., i) loads it.
     *
     * Net effect per 32-bit lane (k = RK0..RK3):
     *
     *	x0 ^= k[0]; x1 ^= k[1]; x2 ^= k[2]; x3 ^= k[3];
     *	x0 = ror(x0, 5);	x2 = ror(x2, 22);
     *	x2 ^= x3 ^ (x1 << 7);	x0 ^= x1 ^ x3;
     *	x1 = ror(x1, 1);	x3 = ror(x3, 7);
     *	x1 ^= x0 ^ x2;		x3 ^= x2 ^ (x0 << 3);
     *	x0 = ror(x0, 13);	x2 = ror(x2, 3);
     */
    477#define KL2(x0, x1, x2, x3, x4, i) \
    478	pxor RK0,		x0 ## 1; \
    479	pxor RK2,		x2 ## 1; \
    480	movdqa x0 ## 1,		x4 ## 1; \
    481	psrld $5,		x0 ## 1; \
    482	pslld $(32 - 5),	x4 ## 1; \
    483	por x4 ## 1,		x0 ## 1; \
    484	pxor RK3,		x3 ## 1; \
    485	pxor RK1,		x1 ## 1; \
    486	movdqa x2 ## 1,		x4 ## 1; \
    487	psrld $22,		x2 ## 1; \
    488	pslld $(32 - 22),	x4 ## 1; \
    489	por x4 ## 1,		x2 ## 1; \
    490	pxor x3 ## 1,		x2 ## 1; \
    491		pxor RK0,		x0 ## 2; \
    492		pxor RK2,		x2 ## 2; \
    493		movdqa x0 ## 2,		x4 ## 2; \
    494		psrld $5,		x0 ## 2; \
    495		pslld $(32 - 5),	x4 ## 2; \
    496		por x4 ## 2,		x0 ## 2; \
    497		pxor RK3,		x3 ## 2; \
    498		pxor RK1,		x1 ## 2; \
    499		movdqa x2 ## 2,		x4 ## 2; \
    500		psrld $22,		x2 ## 2; \
    501		pslld $(32 - 22),	x4 ## 2; \
    502		por x4 ## 2,		x2 ## 2; \
    503		pxor x3 ## 2,		x2 ## 2; \
    504	pxor x3 ## 1,		x0 ## 1; \
    505	movdqa x1 ## 1,		x4 ## 1; \
    506	pslld $7,		x4 ## 1; \
    507	pxor x1 ## 1,		x0 ## 1; \
    508	pxor x4 ## 1,		x2 ## 1; \
    509	movdqa x1 ## 1,		x4 ## 1; \
    510	psrld $1,		x1 ## 1; \
    511	pslld $(32 - 1),	x4 ## 1; \
    512	por x4 ## 1,		x1 ## 1; \
    513		pxor x3 ## 2,		x0 ## 2; \
    514		movdqa x1 ## 2,		x4 ## 2; \
    515		pslld $7,		x4 ## 2; \
    516		pxor x1 ## 2,		x0 ## 2; \
    517		pxor x4 ## 2,		x2 ## 2; \
    518		movdqa x1 ## 2,		x4 ## 2; \
    519		psrld $1,		x1 ## 2; \
    520		pslld $(32 - 1),	x4 ## 2; \
    521		por x4 ## 2,		x1 ## 2; \
    522	movdqa x3 ## 1,		x4 ## 1; \
    523	psrld $7,		x3 ## 1; \
    524	pslld $(32 - 7),	x4 ## 1; \
    525	por x4 ## 1,		x3 ## 1; \
    526	pxor x0 ## 1,		x1 ## 1; \
    527	movdqa x0 ## 1,		x4 ## 1; \
    528	pslld $3,		x4 ## 1; \
    529	pxor x4 ## 1,		x3 ## 1; \
    530	movdqa x0 ## 1,		x4 ## 1; \
    531		movdqa x3 ## 2,		x4 ## 2; \
    532		psrld $7,		x3 ## 2; \
    533		pslld $(32 - 7),	x4 ## 2; \
    534		por x4 ## 2,		x3 ## 2; \
    535		pxor x0 ## 2,		x1 ## 2; \
    536		movdqa x0 ## 2,		x4 ## 2; \
    537		pslld $3,		x4 ## 2; \
    538		pxor x4 ## 2,		x3 ## 2; \
    539		movdqa x0 ## 2,		x4 ## 2; \
    540	psrld $13,		x0 ## 1; \
    541	pslld $(32 - 13),	x4 ## 1; \
    542	por x4 ## 1,		x0 ## 1; \
    543	pxor x2 ## 1,		x1 ## 1; \
    544	pxor x2 ## 1,		x3 ## 1; \
    545	movdqa x2 ## 1,		x4 ## 1; \
    546	psrld $3,		x2 ## 1; \
    547	pslld $(32 - 3),	x4 ## 1; \
    548	por x4 ## 1,		x2 ## 1; \
    549		psrld $13,		x0 ## 2; \
    550		pslld $(32 - 13),	x4 ## 2; \
    551		por x4 ## 2,		x0 ## 2; \
    552		pxor x2 ## 2,		x1 ## 2; \
    553		pxor x2 ## 2,		x3 ## 2; \
    554		movdqa x2 ## 2,		x4 ## 2; \
    555		psrld $3,		x2 ## 2; \
    556		pslld $(32 - 3),	x4 ## 2; \
    557		por x4 ## 2,		x2 ## 2;
    558
    /*
     * S(box, a, b, c, d, t)
     *
     * Apply one S-box to both register groups with no key loading:
     * box_1 then box_2 on the suffix-1 registers, followed by box_1 then box_2
     * on the suffix-2 registers.  'box' is a macro-name prefix (S0..S7 or SI0);
     * a..d and t are unsuffixed register names such as RA.
     */
    559#define S(box, a, b, c, d, t) \
    560	box ## _1(a ## 1, b ## 1, c ## 1, d ## 1, t ## 1); \
    561	box ## _2(a ## 1, b ## 1, c ## 1, d ## 1, t ## 1); \
    562	box ## _1(a ## 2, b ## 2, c ## 2, d ## 2, t ## 2); \
    563	box ## _2(a ## 2, b ## 2, c ## 2, d ## 2, t ## 2);
    564
    /*
     * SP(SBOX, x0, x1, x2, x3, x4, i)
     *
     * Apply one S-box to both register groups while loading round key i into
     * RK0..RK3.  The four get_key() loads are placed between the S-box halves:
     *
     *	RK0 <- key word 0;  SBOX_1 on group 1
     *	RK2 <- key word 2;  SBOX_1 on group 2
     *	RK3 <- key word 3;  SBOX_2 on group 1
     *	RK1 <- key word 1;  SBOX_2 on group 2
     *
     * The key is only loaded here, not applied; at the call sites the KL2()
     * that follows XORs RK0..RK3 into the state.
     *
     * The last line deliberately has no trailing backslash: a dangling line
     * continuation would splice whatever line follows into this macro.
     */
    565#define SP(SBOX, x0, x1, x2, x3, x4, i) \
    566	get_key(i, 0, RK0); \
    567	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
    568	get_key(i, 2, RK2); \
    569	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
    570	get_key(i, 3, RK3); \
    571	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
    572	get_key(i, 1, RK1); \
    573	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
    574
    /*
     * transpose_4x4(r0, r1, r2, r3, spare, tlo, thi)
     *
     * In-place transpose of a 4x4 matrix of 32-bit words held in r0..r3
     * (one row per register).  With r0 = {a0 a1 a2 a3}, r1 = {b0..b3},
     * r2 = {c0..c3}, r3 = {d0..d3} on entry, the result is
     * r0 = {a0 b0 c0 d0}, r1 = {a1 b1 c1 d1}, r2 = {a2 b2 c2 d2},
     * r3 = {a3 b3 c3 d3}.
     *
     * tlo and thi are clobbered.  'spare' (fifth argument) is accepted but
     * never referenced by this macro.
     */
    575#define transpose_4x4(r0, r1, r2, r3, spare, tlo, thi) \
    576	movdqa r0,	thi; \
    577	punpckldq r1,	r0; \
    578	punpckhdq r1,	thi; \
    579	movdqa r2,	tlo; \
    580	punpckhdq r3,	r2; \
    581	punpckldq r3,	tlo; \
    582	movdqa r0,	r1; \
    583	punpcklqdq tlo,	r0; \
    584	punpckhqdq tlo,	r1; \
    585	movdqa thi,	r3; \
    586	punpcklqdq r2,	thi; \
    587	punpckhqdq r2,	r3; \
    588	movdqa thi,	r2;
    589
    /*
     * read_blocks(src, a, b, c, d, t0, t1, t2)
     *
     * Load four consecutive 16-byte blocks from src with unaligned loads
     * (movdqu) into a..d, then transpose them with transpose_4x4() using
     * t0..t2 as its scratch arguments.
     */
    590#define read_blocks(src, a, b, c, d, t0, t1, t2) \
    591	movdqu (0*4*4)(src),	a; \
    592	movdqu (1*4*4)(src),	b; \
    593	movdqu (2*4*4)(src),	c; \
    594	movdqu (3*4*4)(src),	d; \
    595	\
    596	transpose_4x4(a, b, c, d, t0, t1, t2)
    597
    /*
     * write_blocks(dst, a, b, c, d, t0, t1, t2)
     *
     * Counterpart of read_blocks(): transpose a..d with transpose_4x4()
     * (t0..t2 are its scratch arguments), then store them as four consecutive
     * 16-byte blocks at dst with unaligned stores (movdqu).
     */
    598#define write_blocks(dst, a, b, c, d, t0, t1, t2) \
    599	transpose_4x4(a, b, c, d, t0, t1, t2) \
    600	\
    601	movdqu a,	(0*4*4)(dst); \
    602	movdqu b,	(1*4*4)(dst); \
    603	movdqu c,	(2*4*4)(dst); \
    604	movdqu d,	(3*4*4)(dst);
    605
    /*
     * xor_blocks(dst, a, b, c, d, tmp, t1, t2)
     *
     * Like write_blocks(), but combines with what is already at dst instead of
     * overwriting it: after the transpose, each of the four 16-byte blocks at
     * dst is loaded into tmp, XORed into the corresponding register and the
     * register is stored back.  All memory accesses are unaligned (movdqu).
     */
    606#define xor_blocks(dst, a, b, c, d, tmp, t1, t2) \
    607	transpose_4x4(a, b, c, d, tmp, t1, t2) \
    608	\
    609	movdqu (0*4*4)(dst),	tmp; \
    610	pxor tmp,	a; \
    611	movdqu a,	(0*4*4)(dst); \
    612	movdqu (1*4*4)(dst),	tmp; \
    613	pxor tmp,	b; \
    614	movdqu b,	(1*4*4)(dst); \
    615	movdqu (2*4*4)(dst),	tmp; \
    616	pxor tmp,	c; \
    617	movdqu c,	(2*4*4)(dst); \
    618	movdqu (3*4*4)(dst),	tmp; \
    619	pxor tmp,	d; \
    620	movdqu d,	(3*4*4)(dst);
    621
    622SYM_FUNC_START(__serpent_enc_blk_8way)
    623	/* input:
    624	 *	%rdi: ctx, CTX
    625	 *	%rsi: dst
    626	 *	%rdx: src
    627	 *	%rcx: bool, if true: xor output
    628	 */
    629
	/*
	 * Processes eight 16-byte blocks (128 bytes at src, 128 bytes at dst).
	 * Uses %rax and %xmm0-%xmm14 as working registers.
	 */

	/* RNOT := all ones; the S-box macros XOR with it to invert a register. */
    630	pcmpeqd RNOT, RNOT;
    631
	/*
	 * Load blocks 0-3 from src into group 1 and blocks 4-7 from src + 64 into
	 * group 2.  RK0..RK2 are only scratch for the transpose here.
	 */
    632	leaq (4*4*4)(%rdx), %rax;
    633	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
    634	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
    635
	/*
	 * Round schedule: XOR round key 0, then 31 lines of "S-box; LK2" (linear
	 * transformation plus round key 1..31), then S7 followed by round key 32
	 * alone.  The S-boxes cycle S0..S7.  The register names change from line
	 * to line because each S-box leaves its results in a different
	 * permutation of its five registers; each line's argument order picks up
	 * where the previous step left them.
	 */
    636						 K2(RA, RB, RC, RD, RE, 0);
    637	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
    638	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
    639	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
    640	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
    641	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
    642	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
    643	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
    644	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
    645	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
    646	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
    647	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
    648	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
    649	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
    650	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
    651	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
    652	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
    653	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
    654	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
    655	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
    656	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
    657	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
    658	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
    659	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
    660	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
    661	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
    662	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
    663	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
    664	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
    665	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
    666	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
    667	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
    668	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);
    669
	/* %rax := dst + 64, the destination of blocks 4-7. */
    670	leaq (4*4*4)(%rsi), %rax;
    671
	/* Non-zero low byte of %rcx selects the XOR-into-dst path. */
    672	testb %cl, %cl;
    673	jnz .L__enc_xor8;
    674
	/* Plain store: the result is in RA..RD of each group. */
    675	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
    676	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
    677
    678	RET;
    679
    680.L__enc_xor8:
	/* XOR the result into the data already at dst. */
    681	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
    682	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
    683
    684	RET;
    685SYM_FUNC_END(__serpent_enc_blk_8way)
    686
    687SYM_FUNC_START(serpent_dec_blk_8way)
    688	/* input:
    689	 *	%rdi: ctx, CTX
    690	 *	%rsi: dst
    691	 *	%rdx: src
    692	 */
    693
	/*
	 * Processes eight 16-byte blocks (128 bytes at src, 128 bytes at dst).
	 * Uses %rax and %xmm0-%xmm14 as working registers.
	 */

	/* RNOT := all ones; the S-box macros XOR with it to invert a register. */
    694	pcmpeqd RNOT, RNOT;
    695
	/*
	 * Load blocks 0-3 from src into group 1 and blocks 4-7 from src + 64 into
	 * group 2.  RK0..RK2 are only scratch for the transpose here.
	 */
    696	leaq (4*4*4)(%rdx), %rax;
    697	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
    698	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
    699
	/*
	 * Round schedule, the encryption schedule run backwards: XOR round key
	 * 32, then 31 lines of "SP; KL2" for round keys 31 down to 1, then SI0
	 * followed by round key 0 alone.  On each line SP() applies the S-box and
	 * loads round key i into RK0..RK3; KL2() then XORs that key in and undoes
	 * the linear transformation.  The S-boxes cycle SI7..SI0.  As in the
	 * encryption routine, each line's argument order picks up the register
	 * permutation left by the previous step.
	 */
    700						 K2(RA, RB, RC, RD, RE, 32);
    701	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
    702	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
    703	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
    704	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
    705	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
    706	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
    707	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
    708	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
    709	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
    710	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
    711	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
    712	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
    713	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
    714	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
    715	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
    716	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
    717	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
    718	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
    719	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
    720	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
    721	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
    722	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
    723	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
    724	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
    725	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
    726	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
    727	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
    728	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
    729	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
    730	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
    731	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
    732	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);
    733
	/*
	 * The result ends up in RC, RD, RB, RE (in that order) of each group;
	 * store group 1 at dst and group 2 at dst + 64.
	 */
    734	leaq (4*4*4)(%rsi), %rax;
    735	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
    736	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
    737
    738	RET;
    739SYM_FUNC_END(serpent_dec_blk_8way)