cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

serpent-avx2-asm_64.S (21583B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

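/*
 * vpshufb mask that reverses the byte order of each 128-bit lane,
 * i.e. converts a 128-bit value between big- and little-endian.
 */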
.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.text

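/*
 * Register allocation: 16 blocks are processed as two sets of eight
 * (register suffix 1 and 2; the round macros below take the base names
 * RA..RE and paste the set suffix on with ##).  After read_blocks() each
 * ymm register holds the same 32-bit word of eight different blocks, so
 * every vector instruction acts on eight blocks at once.  RNOT is kept
 * loaded with all-ones (vpxor with it implements bitwise NOT) and tp is
 * a scratch register.
 */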
#define CTX %rdi

#define RNOT %ymm0
#define tp  %ymm1

#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15

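/*
 * Serpent S-boxes as straight-line boolean functions: no lookup tables and
 * hence no data-dependent memory accesses.  Sn_1/Sn_2 together compute
 * S-box n, SIn_1/SIn_2 its inverse, on four vectors of 32-bit words; tp
 * and x4 receive intermediates, and the results come out in a permuted
 * register set, which is why the round sequences below rotate their
 * register arguments.  Each S-box is split into two macro halves so that
 * other work (e.g. the round-key loads in SP below) can be scheduled
 * between them.
 */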
#define S0_1(x0, x1, x2, x3, x4)      \
	vpor		x0,   x3, tp; \
	vpxor		x3,   x0, x0; \
	vpxor		x2,   x3, x4; \
	vpxor		RNOT, x4, x4; \
	vpxor		x1,   tp, x3; \
	vpand		x0,   x1, x1; \
	vpxor		x4,   x1, x1; \
	vpxor		x0,   x2, x2;
#define S0_2(x0, x1, x2, x3, x4)      \
	vpxor		x3,   x0, x0; \
	vpor		x0,   x4, x4; \
	vpxor		x2,   x0, x0; \
	vpand		x1,   x2, x2; \
	vpxor		x2,   x3, x3; \
	vpxor		RNOT, x1, x1; \
	vpxor		x4,   x2, x2; \
	vpxor		x2,   x1, x1;

#define S1_1(x0, x1, x2, x3, x4)      \
	vpxor		x0,   x1, tp; \
	vpxor		x3,   x0, x0; \
	vpxor		RNOT, x3, x3; \
	vpand		tp,   x1, x4; \
	vpor		tp,   x0, x0; \
	vpxor		x2,   x3, x3; \
	vpxor		x3,   x0, x0; \
	vpxor		x3,   tp, x1;
#define S1_2(x0, x1, x2, x3, x4)      \
	vpxor		x4,   x3, x3; \
	vpor		x4,   x1, x1; \
	vpxor		x2,   x4, x4; \
	vpand		x0,   x2, x2; \
	vpxor		x1,   x2, x2; \
	vpor		x0,   x1, x1; \
	vpxor		RNOT, x0, x0; \
	vpxor		x2,   x0, x0; \
	vpxor		x1,   x4, x4;

#define S2_1(x0, x1, x2, x3, x4)      \
	vpxor		RNOT, x3, x3; \
	vpxor		x0,   x1, x1; \
	vpand		x2,   x0, tp; \
	vpxor		x3,   tp, tp; \
	vpor		x0,   x3, x3; \
	vpxor		x1,   x2, x2; \
	vpxor		x1,   x3, x3; \
	vpand		tp,   x1, x1;
#define S2_2(x0, x1, x2, x3, x4)      \
	vpxor		x2,   tp, tp; \
	vpand		x3,   x2, x2; \
	vpor		x1,   x3, x3; \
	vpxor		RNOT, tp, tp; \
	vpxor		tp,   x3, x3; \
	vpxor		tp,   x0, x4; \
	vpxor		x2,   tp, x0; \
	vpor		x2,   x1, x1;

#define S3_1(x0, x1, x2, x3, x4)      \
	vpxor		x3,   x1, tp; \
	vpor		x0,   x3, x3; \
	vpand		x0,   x1, x4; \
	vpxor		x2,   x0, x0; \
	vpxor		tp,   x2, x2; \
	vpand		x3,   tp, x1; \
	vpxor		x3,   x2, x2; \
	vpor		x4,   x0, x0; \
	vpxor		x3,   x4, x4;
#define S3_2(x0, x1, x2, x3, x4)      \
	vpxor		x0,   x1, x1; \
	vpand		x3,   x0, x0; \
	vpand		x4,   x3, x3; \
	vpxor		x2,   x3, x3; \
	vpor		x1,   x4, x4; \
	vpand		x1,   x2, x2; \
	vpxor		x3,   x4, x4; \
	vpxor		x3,   x0, x0; \
	vpxor		x2,   x3, x3;

#define S4_1(x0, x1, x2, x3, x4)      \
	vpand		x0,   x3, tp; \
	vpxor		x3,   x0, x0; \
	vpxor		x2,   tp, tp; \
	vpor		x3,   x2, x2; \
	vpxor		x1,   x0, x0; \
	vpxor		tp,   x3, x4; \
	vpor		x0,   x2, x2; \
	vpxor		x1,   x2, x2;
#define S4_2(x0, x1, x2, x3, x4)      \
	vpand		x0,   x1, x1; \
	vpxor		x4,   x1, x1; \
	vpand		x2,   x4, x4; \
	vpxor		tp,   x2, x2; \
	vpxor		x0,   x4, x4; \
	vpor		x1,   tp, x3; \
	vpxor		RNOT, x1, x1; \
	vpxor		x0,   x3, x3;

#define S5_1(x0, x1, x2, x3, x4)      \
	vpor		x0,   x1, tp; \
	vpxor		tp,   x2, x2; \
	vpxor		RNOT, x3, x3; \
	vpxor		x0,   x1, x4; \
	vpxor		x2,   x0, x0; \
	vpand		x4,   tp, x1; \
	vpor		x3,   x4, x4; \
	vpxor		x0,   x4, x4;
#define S5_2(x0, x1, x2, x3, x4)      \
	vpand		x3,   x0, x0; \
	vpxor		x3,   x1, x1; \
	vpxor		x2,   x3, x3; \
	vpxor		x1,   x0, x0; \
	vpand		x4,   x2, x2; \
	vpxor		x2,   x1, x1; \
	vpand		x0,   x2, x2; \
	vpxor		x2,   x3, x3;

#define S6_1(x0, x1, x2, x3, x4)      \
	vpxor		x0,   x3, x3; \
	vpxor		x2,   x1, tp; \
	vpxor		x0,   x2, x2; \
	vpand		x3,   x0, x0; \
	vpor		x3,   tp, tp; \
	vpxor		RNOT, x1, x4; \
	vpxor		tp,   x0, x0; \
	vpxor		x2,   tp, x1;
#define S6_2(x0, x1, x2, x3, x4)      \
	vpxor		x4,   x3, x3; \
	vpxor		x0,   x4, x4; \
	vpand		x0,   x2, x2; \
	vpxor		x1,   x4, x4; \
	vpxor		x3,   x2, x2; \
	vpand		x1,   x3, x3; \
	vpxor		x0,   x3, x3; \
	vpxor		x2,   x1, x1;

#define S7_1(x0, x1, x2, x3, x4)      \
	vpxor		RNOT, x1, tp; \
	vpxor		RNOT, x0, x0; \
	vpand		x2,   tp, x1; \
	vpxor		x3,   x1, x1; \
	vpor		tp,   x3, x3; \
	vpxor		x2,   tp, x4; \
	vpxor		x3,   x2, x2; \
	vpxor		x0,   x3, x3; \
	vpor		x1,   x0, x0;
#define S7_2(x0, x1, x2, x3, x4)      \
	vpand		x0,   x2, x2; \
	vpxor		x4,   x0, x0; \
	vpxor		x3,   x4, x4; \
	vpand		x0,   x3, x3; \
	vpxor		x1,   x4, x4; \
	vpxor		x4,   x2, x2; \
	vpxor		x1,   x3, x3; \
	vpor		x0,   x4, x4; \
	vpxor		x1,   x4, x4;

#define SI0_1(x0, x1, x2, x3, x4)     \
	vpxor		x0,   x1, x1; \
	vpor		x1,   x3, tp; \
	vpxor		x1,   x3, x4; \
	vpxor		RNOT, x0, x0; \
	vpxor		tp,   x2, x2; \
	vpxor		x0,   tp, x3; \
	vpand		x1,   x0, x0; \
	vpxor		x2,   x0, x0;
#define SI0_2(x0, x1, x2, x3, x4)     \
	vpand		x3,   x2, x2; \
	vpxor		x4,   x3, x3; \
	vpxor		x3,   x2, x2; \
	vpxor		x3,   x1, x1; \
	vpand		x0,   x3, x3; \
	vpxor		x0,   x1, x1; \
	vpxor		x2,   x0, x0; \
	vpxor		x3,   x4, x4;

#define SI1_1(x0, x1, x2, x3, x4)     \
	vpxor		x3,   x1, x1; \
	vpxor		x2,   x0, tp; \
	vpxor		RNOT, x2, x2; \
	vpor		x1,   x0, x4; \
	vpxor		x3,   x4, x4; \
	vpand		x1,   x3, x3; \
	vpxor		x2,   x1, x1; \
	vpand		x4,   x2, x2;
#define SI1_2(x0, x1, x2, x3, x4)     \
	vpxor		x1,   x4, x4; \
	vpor		x3,   x1, x1; \
	vpxor		tp,   x3, x3; \
	vpxor		tp,   x2, x2; \
	vpor		x4,   tp, x0; \
	vpxor		x4,   x2, x2; \
	vpxor		x0,   x1, x1; \
	vpxor		x1,   x4, x4;

#define SI2_1(x0, x1, x2, x3, x4)     \
	vpxor		x1,   x2, x2; \
	vpxor		RNOT, x3, tp; \
	vpor		x2,   tp, tp; \
	vpxor		x3,   x2, x2; \
	vpxor		x0,   x3, x4; \
	vpxor		x1,   tp, x3; \
	vpor		x2,   x1, x1; \
	vpxor		x0,   x2, x2;
#define SI2_2(x0, x1, x2, x3, x4)     \
	vpxor		x4,   x1, x1; \
	vpor		x3,   x4, x4; \
	vpxor		x3,   x2, x2; \
	vpxor		x2,   x4, x4; \
	vpand		x1,   x2, x2; \
	vpxor		x3,   x2, x2; \
	vpxor		x4,   x3, x3; \
	vpxor		x0,   x4, x4;

#define SI3_1(x0, x1, x2, x3, x4)     \
	vpxor		x1,   x2, x2; \
	vpand		x2,   x1, tp; \
	vpxor		x0,   tp, tp; \
	vpor		x1,   x0, x0; \
	vpxor		x3,   x1, x4; \
	vpxor		x3,   x0, x0; \
	vpor		tp,   x3, x3; \
	vpxor		x2,   tp, x1;
#define SI3_2(x0, x1, x2, x3, x4)     \
	vpxor		x3,   x1, x1; \
	vpxor		x2,   x0, x0; \
	vpxor		x3,   x2, x2; \
	vpand		x1,   x3, x3; \
	vpxor		x0,   x1, x1; \
	vpand		x2,   x0, x0; \
	vpxor		x3,   x4, x4; \
	vpxor		x0,   x3, x3; \
	vpxor		x1,   x0, x0;

#define SI4_1(x0, x1, x2, x3, x4)     \
	vpxor		x3,   x2, x2; \
	vpand		x1,   x0, tp; \
	vpxor		x2,   tp, tp; \
	vpor		x3,   x2, x2; \
	vpxor		RNOT, x0, x4; \
	vpxor		tp,   x1, x1; \
	vpxor		x2,   tp, x0; \
	vpand		x4,   x2, x2;
#define SI4_2(x0, x1, x2, x3, x4)     \
	vpxor		x0,   x2, x2; \
	vpor		x4,   x0, x0; \
	vpxor		x3,   x0, x0; \
	vpand		x2,   x3, x3; \
	vpxor		x3,   x4, x4; \
	vpxor		x1,   x3, x3; \
	vpand		x0,   x1, x1; \
	vpxor		x1,   x4, x4; \
	vpxor		x3,   x0, x0;

#define SI5_1(x0, x1, x2, x3, x4)     \
	vpor		x2,   x1, tp; \
	vpxor		x1,   x2, x2; \
	vpxor		x3,   tp, tp; \
	vpand		x1,   x3, x3; \
	vpxor		x3,   x2, x2; \
	vpor		x0,   x3, x3; \
	vpxor		RNOT, x0, x0; \
	vpxor		x2,   x3, x3; \
	vpor		x0,   x2, x2;
#define SI5_2(x0, x1, x2, x3, x4)     \
	vpxor		tp,   x1, x4; \
	vpxor		x4,   x2, x2; \
	vpand		x0,   x4, x4; \
	vpxor		tp,   x0, x0; \
	vpxor		x3,   tp, x1; \
	vpand		x2,   x0, x0; \
	vpxor		x3,   x2, x2; \
	vpxor		x2,   x0, x0; \
	vpxor		x4,   x2, x2; \
	vpxor		x3,   x4, x4;

#define SI6_1(x0, x1, x2, x3, x4)     \
	vpxor		x2,   x0, x0; \
	vpand		x3,   x0, tp; \
	vpxor		x3,   x2, x2; \
	vpxor		x2,   tp, tp; \
	vpxor		x1,   x3, x3; \
	vpor		x0,   x2, x2; \
	vpxor		x3,   x2, x2; \
	vpand		tp,   x3, x3;
#define SI6_2(x0, x1, x2, x3, x4)     \
	vpxor		RNOT, tp, tp; \
	vpxor		x1,   x3, x3; \
	vpand		x2,   x1, x1; \
	vpxor		tp,   x0, x4; \
	vpxor		x4,   x3, x3; \
	vpxor		x2,   x4, x4; \
	vpxor		x1,   tp, x0; \
	vpxor		x0,   x2, x2;

#define SI7_1(x0, x1, x2, x3, x4)     \
	vpand		x0,   x3, tp; \
	vpxor		x2,   x0, x0; \
	vpor		x3,   x2, x2; \
	vpxor		x1,   x3, x4; \
	vpxor		RNOT, x0, x0; \
	vpor		tp,   x1, x1; \
	vpxor		x0,   x4, x4; \
	vpand		x2,   x0, x0; \
	vpxor		x1,   x0, x0;
#define SI7_2(x0, x1, x2, x3, x4)     \
	vpand		x2,   x1, x1; \
	vpxor		x2,   tp, x3; \
	vpxor		x3,   x4, x4; \
	vpand		x3,   x2, x2; \
	vpor		x0,   x3, x3; \
	vpxor		x4,   x1, x1; \
	vpxor		x4,   x3, x3; \
	vpand		x0,   x4, x4; \
	vpxor		x2,   x4, x4;

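/*
 * The expanded key in ctx is an array of 33 round keys of four 32-bit
 * words each; get_key broadcasts word j of round key i into all eight
 * 32-bit lanes of t.
 */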
#define get_key(i,j,t) \
	vpbroadcastd (4*(i)+(j))*4(CTX), t;

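/* XOR round key i into both eight-block sets (used for the round-0 input
 * whitening and the final round-32 key mixing). */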
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0,	x0 ## 1, x0 ## 1; \
	vpxor RK1,	x1 ## 1, x1 ## 1; \
	vpxor RK2,	x2 ## 1, x2 ## 1; \
	vpxor RK3,	x3 ## 1, x3 ## 1; \
		vpxor RK0,	x0 ## 2, x0 ## 2; \
		vpxor RK1,	x1 ## 2, x1 ## 2; \
		vpxor RK2,	x2 ## 2, x2 ## 2; \
		vpxor RK3,	x3 ## 2, x3 ## 2;

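/*
 * Serpent linear transformation followed by mixing in round key i, applied
 * to both block sets with the key loads interleaved.  Rotates are built
 * from shift pairs plus vpor.  Per 32-bit word this is equivalent to the
 * generic C code (rol = rotate left):
 *
 *	x0 = rol(x0, 13);	x2 = rol(x2, 3);
 *	x1 ^= x0 ^ x2;		x3 ^= x2 ^ (x0 << 3);
 *	x1 = rol(x1, 1);	x3 = rol(x3, 7);
 *	x0 ^= x1 ^ x3;		x2 ^= x3 ^ (x1 << 7);
 *	x0 = rol(x0, 5);	x2 = rol(x2, 22);
 *	x0 ^= k[0]; x1 ^= k[1]; x2 ^= k[2]; x3 ^= k[3];
 */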
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13,		x0 ## 1, x4 ## 1;          \
	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x2 ## 1, x4 ## 1;          \
	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
		vpslld $13,		x0 ## 2, x4 ## 2;          \
		vpsrld $(32 - 13),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3,		x2 ## 2, x4 ## 2;          \
		vpsrld $(32 - 3),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1,		x1 ## 1, x4 ## 1;          \
	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1;          \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1;          \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
		vpslld $1,		x1 ## 2, x4 ## 2;          \
		vpsrld $(32 - 1),	x1 ## 2, x1 ## 2;          \
		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3,		x0 ## 2, x4 ## 2;          \
		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
		get_key(i, 3, RK3); \
	vpslld $7,		x3 ## 1, x4 ## 1;          \
	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1;          \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1;          \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
		vpslld $7,		x3 ## 2, x4 ## 2;          \
		vpsrld $(32 - 7),	x3 ## 2, x3 ## 2;          \
		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
		vpslld $7,		x1 ## 2, x4 ## 2;          \
		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
		get_key(i, 2, RK2); \
	vpxor			RK1, x1 ## 1, x1 ## 1;     \
	vpxor			RK3, x3 ## 1, x3 ## 1;     \
	vpslld $5,		x0 ## 1, x4 ## 1;          \
	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22,		x2 ## 1, x4 ## 1;          \
	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			RK0, x0 ## 1, x0 ## 1;     \
	vpxor			RK2, x2 ## 1, x2 ## 1;     \
		vpxor			RK1, x1 ## 2, x1 ## 2;     \
		vpxor			RK3, x3 ## 2, x3 ## 2;     \
		vpslld $5,		x0 ## 2, x4 ## 2;          \
		vpsrld $(32 - 5),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpslld $22,		x2 ## 2, x4 ## 2;          \
		vpsrld $(32 - 22),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			RK0, x0 ## 2, x0 ## 2;     \
		vpxor			RK2, x2 ## 2, x2 ## 2;

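/*
 * Inverse of LK2, for decryption: XOR in round key i (already prefetched
 * into RK0..RK3 by SP), then apply the inverse linear transformation.
 * Per 32-bit word (ror = rotate right):
 *
 *	x0 ^= k[0]; x1 ^= k[1]; x2 ^= k[2]; x3 ^= k[3];
 *	x2 = ror(x2, 22);	x0 = ror(x0, 5);
 *	x2 ^= x3 ^ (x1 << 7);	x0 ^= x1 ^ x3;
 *	x3 = ror(x3, 7);	x1 = ror(x1, 1);
 *	x3 ^= x2 ^ (x0 << 3);	x1 ^= x0 ^ x2;
 *	x2 = ror(x2, 3);	x0 = ror(x0, 13);
 */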
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor			RK0, x0 ## 1, x0 ## 1;     \
	vpxor			RK2, x2 ## 1, x2 ## 1;     \
	vpsrld $5,		x0 ## 1, x4 ## 1;          \
	vpslld $(32 - 5),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			RK3, x3 ## 1, x3 ## 1;     \
	vpxor			RK1, x1 ## 1, x1 ## 1;     \
	vpsrld $22,		x2 ## 1, x4 ## 1;          \
	vpslld $(32 - 22),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
		vpxor			RK0, x0 ## 2, x0 ## 2;     \
		vpxor			RK2, x2 ## 2, x2 ## 2;     \
		vpsrld $5,		x0 ## 2, x4 ## 2;          \
		vpslld $(32 - 5),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			RK3, x3 ## 2, x3 ## 2;     \
		vpxor			RK1, x1 ## 2, x1 ## 2;     \
		vpsrld $22,		x2 ## 2, x4 ## 2;          \
		vpslld $(32 - 22),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1;          \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1,		x1 ## 1, x4 ## 1;          \
	vpslld $(32 - 1),	x1 ## 1, x1 ## 1;          \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
		vpslld $7,		x1 ## 2, x4 ## 2;          \
		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpsrld $1,		x1 ## 2, x4 ## 2;          \
		vpslld $(32 - 1),	x1 ## 2, x1 ## 2;          \
		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7,		x3 ## 1, x4 ## 1;          \
	vpslld $(32 - 7),	x3 ## 1, x3 ## 1;          \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1;          \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
		vpsrld $7,		x3 ## 2, x4 ## 2;          \
		vpslld $(32 - 7),	x3 ## 2, x3 ## 2;          \
		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3,		x0 ## 2, x4 ## 2;          \
		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13,		x0 ## 1, x4 ## 1;          \
	vpslld $(32 - 13),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3,		x2 ## 1, x4 ## 1;          \
	vpslld $(32 - 3),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
		vpsrld $13,		x0 ## 2, x4 ## 2;          \
		vpslld $(32 - 13),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
		vpsrld $3,		x2 ## 2, x4 ## 2;          \
		vpslld $(32 - 3),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2;

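/*
 * S() applies an S-box to both block sets; SP() does the same while
 * prefetching the four words of round key i into RK0..RK3 between the
 * S-box halves, for consumption by the following KL2().
 */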
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \

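/*
 * 4x4 dword transpose within each 128-bit lane: converts four blocks per
 * lane between consecutive-block order and the word-sliced layout the
 * round macros operate on.  The transformation is its own inverse, hence
 * the read_blocks/write_blocks aliases below.
 */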
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 */

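	/* RNOT := all-ones, used by the S-boxes to implement bitwise NOT */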
	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

						 K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(__serpent_enc_blk16)

.align 8
SYM_FUNC_START_LOCAL(__serpent_dec_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
	 */

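	/*
	 * Rounds run in reverse: inverse S-boxes SI7..SI0 with the inverse
	 * linear transform, consuming round keys 32 down to 0.  RNOT again
	 * serves as the all-ones NOT mask.
	 */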
	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

						 K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(__serpent_dec_blk16)

SYM_FUNC_START(serpent_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

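	/* zero the upper ymm halves to avoid AVX/SSE transition penalties */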
	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk16;

	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_ecb_enc_16way)

SYM_FUNC_START(serpent_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_ecb_dec_16way)

SYM_FUNC_START(serpent_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

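	/*
	 * store_cbc_16way (from glue_helper-asm-avx2.S) finishes CBC
	 * decryption: each decrypted block is XORed with the preceding
	 * ciphertext block from src before being written to dst; chaining
	 * the very first block with the IV is left to the C glue code.
	 * RK0 is passed as a temporary.
	 */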
	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
			RK0);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_cbc_dec_16way)