cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

crc32c-pcl-intel-asm_64.S (13390B)


      1/*
      2 * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
      3 *
      4 * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
      5 * downloaded from:
      6 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
      7 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
      8 *
      9 * Copyright (C) 2012 Intel Corporation.
     10 *
     11 * Authors:
     12 *	Wajdi Feghali <wajdi.k.feghali@intel.com>
     13 *	James Guilford <james.guilford@intel.com>
     14 *	David Cote <david.m.cote@intel.com>
     15 *	Tim Chen <tim.c.chen@linux.intel.com>
     16 *
     17 * This software is available to you under a choice of one of two
     18 * licenses.  You may choose to be licensed under the terms of the GNU
     19 * General Public License (GPL) Version 2, available from the file
     20 * COPYING in the main directory of this source tree, or the
     21 * OpenIB.org BSD license below:
     22 *
     23 *     Redistribution and use in source and binary forms, with or
     24 *     without modification, are permitted provided that the following
     25 *     conditions are met:
     26 *
     27 *      - Redistributions of source code must retain the above
     28 *        copyright notice, this list of conditions and the following
     29 *        disclaimer.
     30 *
     31 *      - Redistributions in binary form must reproduce the above
     32 *        copyright notice, this list of conditions and the following
     33 *        disclaimer in the documentation and/or other materials
     34 *        provided with the distribution.
     35 *
     36 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     37 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     38 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     39 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     40 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     41 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     42 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     43 * SOFTWARE.
     44 */
     45
     46#include <linux/linkage.h>
     47#include <asm/nospec-branch.h>
     48
     49## iSCSI CRC32C implementation using the crc32 and pclmulqdq instructions
     50
       # LABEL stem idx: define the label <stem><idx> at the current location.
       # The empty \() ends the second parameter name so the colon can follow.
     51.macro LABEL stem idx
     52\stem\idx\():
     53.endm
     54
       # JMPTBL_ENTRY idx: emit one 8-byte table slot holding the address of
       # the label crc_<idx>.
     55.macro JMPTBL_ENTRY idx
     56.quad crc_\idx
     57.endm
     58
       # JNC_LESS_THAN bound: jump to the label less_than_<bound> when the
       # carry flag is clear.
     59.macro JNC_LESS_THAN bound
     60	jnc less_than_\bound
     61.endm
     62
     63# Threshold below which a buffer is considered "small" and routed to the
     64# more efficient "by-1" code at the small label. That code keeps only the
     65# low 8 bits of the length (shl $32-8, len_dw), so it handles at most 255
       # bytes and SMALL_SIZE can be no larger than 255. The assembly-time
       # check below enforces that limit.
     66
     67#define SMALL_SIZE 200
     68
     69.if (SMALL_SIZE > 255)
     70.error "SMALL_SIZE must be < 256"
     71.endif
     72
     73# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
       #
       # Computes the CRC32C of the len bytes at buffer, starting from the
       # value crc_init, and returns the result in rax.
       #
       # Arguments are read from rdi (buffer), rsi (len) and rdx (crc_init).
       # The registers rbx, rdi and rsi are pushed on entry and popped at
       # do_return. Also overwritten: rax, rcx, rdx, r8-r11, xmm0-xmm2, flags.
       #
       # Outline: (1) consume bytes until the pointer is 8-byte aligned;
       # (2)-(4) run three equal-sized streams through the unrolled crc_array
       # and merge the three results with pclmulqdq; (5) repeat while 24 or
       # more bytes remain; (6) finish the tail with a single stream.
       #
       # NOTE(review): len is declared int above but is read through the full
       # rsi register (cmp, sub, movq). This presumably relies on the caller
       # passing rsi with clean upper 32 bits -- confirm against the C caller.
     74
     75.text
     76SYM_FUNC_START(crc_pcl)
       ## Register roles. bufptmp and block_0 name the same register (rcx), as
       ## do block_1 and crc_init_arg (rdx). bufp is defined without the
       ## percent sign so that it can be passed bare to JMP_NOSPEC; everywhere
       ## else it is written with the percent sign in front.
     77#define    bufp		rdi
     78#define    bufp_dw	%edi
     79#define    bufp_w	%di
     80#define    bufp_b	%dil
     81#define    bufptmp	%rcx
     82#define    block_0	%rcx
     83#define    block_1	%rdx
     84#define    block_2	%r11
     85#define    len		%rsi
     86#define    len_dw	%esi
     87#define    len_w	%si
     88#define    len_b	%sil
     89#define    crc_init_arg %rdx
     90#define    tmp		%rbx
     91#define    crc_init	%r8
     92#define    crc_init_dw	%r8d
     93#define    crc1		%r9
     94#define    crc2		%r10
     95
       	## rbx serves as tmp, and rdi/rsi are modified below; all three are
       	## restored at do_return.
     96	pushq   %rbx
     97	pushq   %rdi
     98	pushq   %rsi
     99
    100	## Move crc_init out of rdx, which is reused below as block_1
    101	mov     crc_init_arg, crc_init
    102
    103	################################################################
    104	## 1) ALIGN:
    105	################################################################
    106
    107	mov     %bufp, bufptmp		# bufptmp = buffer address
    108	neg     %bufp
    109	and     $7, %bufp		# (-address) & 7 = number of bytes up to
    110					# the next 8-byte boundary
    111	je      proc_block		# Skip if aligned
    112
    113	## If len is less than 8 and we're unaligned, we need to jump
    114	## to special code to avoid reading beyond the end of the buffer
    115	cmp     $8, len
    116	jae     do_align
    117	# less_than_8 expects length in upper 3 bits of len_dw
    118	# less_than_8_post_shl1 expects length = carryflag * 4 + len_dw[31:30]
    119	shl     $32-3+1, len_dw
    120	jmp     less_than_8_post_shl1
    121
    122do_align:
    123	#### Calculate CRC of unaligned bytes of the buffer (if any)
       	## len >= 8 on this path, so the 8-byte load stays inside the buffer.
    124	movq    (bufptmp), tmp		# load a quadword from the buffer
    125	add     %bufp, bufptmp		# align buffer pointer for quadword
    126					# processing
    127	sub     %bufp, len		# update buffer length
    128align_loop:
    129	crc32b  %bl, crc_init_dw 	# compute crc32 of 1-byte
    130	shr     $8, tmp			# get next byte
    131	dec     %bufp			# one fewer byte before the boundary
    132	jne     align_loop
    133
    134proc_block:
    135
    136	################################################################
    137	## 2) PROCESS  BLOCKS:
    138	################################################################
    139
    140	## compute num of bytes to be processed
    141	movq    len, tmp		# save num bytes in tmp
    142
    143	cmpq    $128*24, len
    144	jae     full_block
    145
    146continue_block:
    147	cmpq    $SMALL_SIZE, len
    148	jb      small
    149
    150	## len < 128*24
    151	movq    $2731, %rax		# 2731 = ceil(2^16 / 24)
    152	mul     len_dw			# edx:eax = eax * len_dw (edx is overwritten)
    153	shrq    $16, %rax
    154
    155	## eax contains floor(bytes / 24) = num 24-byte chunks to do
    156
    157	## process rax 24-byte chunks (128 >= rax >= 0)
    158
    159	## compute end address of each block
    160	## block 0 (base addr + RAX * 8)
    161	## block 1 (base addr + RAX * 16)
    162	## block 2 (base addr + RAX * 24)
    163	lea     (bufptmp, %rax, 8), block_0
    164	lea     (block_0, %rax, 8), block_1
    165	lea     (block_1, %rax, 8), block_2
    166
    167	xor     crc1, crc1
    168	xor     crc2, crc2
    169
    170	## branch into array
       	## jump_table entry rax holds the address of the label for that chunk
       	## count. The rdi register is free to carry the target because the
       	## buffer address was copied to rcx in step 1.
    171	mov	jump_table(,%rax,8), %bufp
    172	JMP_NOSPEC bufp
    173
    174	################################################################
    175	## 2a) PROCESS FULL BLOCKS:
    176	################################################################
    177full_block:
       	## At least 128*24 bytes remain: use the maximum of 128 quadwords per
       	## stream. block_0 is the same register as the buffer pointer, so it is
       	## advanced last, after block_1 and block_2 were derived from it.
    178	movl    $128,%eax
    179	lea     128*8*2(block_0), block_1
    180	lea     128*8*3(block_0), block_2
    181	add     $128*8*1, block_0
    182
    183	xor     crc1,crc1
    184	xor     crc2,crc2
    185
    186	# Fall through into top of crc array (crc_128)
    187
    188	################################################################
    189	## 3) CRC Array:
    190	################################################################
    191
       	## Unrolled body with entry labels crc_128 down to crc_1. Each step
       	## feeds one quadword per stream, addressed at a negative offset from
       	## the end pointers block_0, block_1 and block_2, into crc_init, crc1
       	## and crc2 respectively. Entering at the label for count N therefore
       	## runs N steps.
    192crc_array:
    193	i=128
    194.rept 128-1
    195.altmacro
    196LABEL crc_ %i
    197.noaltmacro
    198	ENDBR
    199	crc32q   -i*8(block_0), crc_init
    200	crc32q   -i*8(block_1), crc1
    201	crc32q   -i*8(block_2), crc2
    202	i=(i-1)
    203.endr
    204
       	## Final step (i = 1): the last quadword of block 2 is left out here
       	## and folded in during the combine step below.
    205.altmacro
    206LABEL crc_ %i
    207.noaltmacro
    208	ENDBR
    209	crc32q   -i*8(block_0), crc_init
    210	crc32q   -i*8(block_1), crc1
    211# SKIP  crc32  -i*8(block_2), crc2 ; Don't do this one yet
    212
    213	mov     block_2, block_0	# buffer pointer = end of the three streams
    214
    215	################################################################
    216	## 4) Combine three results:
    217	################################################################
    218
    219	lea	(K_table-8)(%rip), %bufp		# first entry is for idx 1
    220	shlq    $3, %rax			# rax *= 8
    221	pmovzxdq (%bufp,%rax), %xmm0		# 2 consts: K1:K2
    222	leal	(%eax,%eax,2), %eax		# rax *= 3 (total *24)
    223	subq    %rax, tmp			# tmp -= rax*24
    224
    225	movq    crc_init, %xmm1			# CRC for block 1
    226	pclmulqdq $0x00, %xmm0, %xmm1		# Multiply by K2
    227
    228	movq    crc1, %xmm2			# CRC for block 2
    229	pclmulqdq $0x10, %xmm0, %xmm2		# Multiply by K1
    230
    231	pxor    %xmm2,%xmm1
    232	movq    %xmm1, %rax
    233	xor     -i*8(block_2), %rax		# mix in the quadword skipped above
    234	mov     crc2, crc_init
    235	crc32   %rax, crc_init
    236
    237	################################################################
    238	## 5) Check for end:
    239	################################################################
    240
       	## crc_0 is the target of jump_table entry 0. tmp holds the number of
       	## bytes still to be processed.
    241LABEL crc_ 0
    242	ENDBR
    243	mov     tmp, len
    244	cmp     $128*24, tmp
    245	jae     full_block
    246	cmp     $24, tmp
    247	jae     continue_block
    248
       	## The jz instructions from here on test the flags left by the most
       	## recent shl; the code relies on the crc32 instructions in between not
       	## changing them.
    249less_than_24:
    250	shl     $32-4, len_dw			# less_than_16 expects length
    251						# in upper 4 bits of len_dw
    252	jnc     less_than_16
    253	crc32q  (bufptmp), crc_init
    254	crc32q  8(bufptmp), crc_init
    255	jz      do_return
    256	add     $16, bufptmp
    257	# len is less than 8 if we got here
    258	# less_than_8 expects length in upper 3 bits of len_dw
    259	# less_than_8_post_shl1 expects length = carryflag * 4 + len_dw[31:30]
    260	shl     $2, len_dw
    261	jmp     less_than_8_post_shl1
    262
    263	#######################################################################
    264	## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
    265	#######################################################################
       	## The .rept below emits the labels less_than_256 down to less_than_16.
       	## Each stage shifts the next length bit into the carry flag. When the
       	## bit is set it processes that many bytes as quadwords and advances
       	## the pointer; when it is clear it jumps to the next smaller stage.
    266small:
    267	shl $32-8, len_dw		# Prepare len_dw for less_than_256
    268	j=256
    269.rept 5					# j = {256, 128, 64, 32, 16}
    270.altmacro
    271LABEL less_than_ %j			# less_than_j: Length should be in
    272					# upper lg(j) bits of len_dw
    273	j=(j/2)
    274	shl     $1, len_dw		# Get next MSB
    275	JNC_LESS_THAN %j
    276.noaltmacro
    277	i=0
    278.rept (j/8)
    279	crc32q  i(bufptmp), crc_init	# Compute crc32 of 8-byte data
    280	i=i+8
    281.endr
    282	jz      do_return		# Return if remaining length is zero
    283	add     $j, bufptmp		# Advance buf
    284.endr
    285
    286less_than_8:				# Length should be stored in
    287					# upper 3 bits of len_dw
    288	shl     $1, len_dw
    289less_than_8_post_shl1:
    290	jnc     less_than_4
    291	crc32l  (bufptmp), crc_init_dw	# CRC of 4 bytes
    292	jz      do_return		# return if remaining data is zero
    293	add     $4, bufptmp
    294less_than_4:				# Length should be stored in
    295					# upper 2 bits of len_dw
    296	shl     $1, len_dw
    297	jnc     less_than_2
    298	crc32w  (bufptmp), crc_init_dw	# CRC of 2 bytes
    299	jz      do_return		# return if remaining data is zero
    300	add     $2, bufptmp
    301less_than_2:				# Length should be stored in the MSB
    302					# of len_dw
    303	shl     $1, len_dw
    304	jnc     less_than_1
    305	crc32b  (bufptmp), crc_init_dw	# CRC of 1 byte
    306less_than_1:				# Length should be zero
    307do_return:
    308	movq    crc_init, %rax		# return value
    309	popq    %rsi
    310	popq    %rdi
    311	popq    %rbx
    312        RET
    313SYM_FUNC_END(crc_pcl)
    314
    315.section	.rodata, "a", @progbits
    316        ################################################################
    317        ## jump table        Table is 129 entries x 8 bytes each
               ## Entry i (i = 0..128) is a .quad holding the address of the
               ## label crc_<i>; crc_pcl indexes the table with a scale of 8.
               ## The table is 8-byte aligned so that each entry is naturally
               ## aligned, matching K_table.
    318        ################################################################
    319.align 8
    320jump_table:
    321	i=0
    322.rept 129
    323.altmacro
    324JMPTBL_ENTRY %i
    325.noaltmacro
    326	i=i+1
    327.endr
    328
    329
    330	################################################################
    331	## PCLMULQDQ tables
    332	## Table is 128 entries x 2 words (8 bytes) each
    333	################################################################
    334.align 8
    335K_table:
    336	.long 0x493c7d27, 0x00000001
    337	.long 0xba4fc28e, 0x493c7d27
    338	.long 0xddc0152b, 0xf20c0dfe
    339	.long 0x9e4addf8, 0xba4fc28e
    340	.long 0x39d3b296, 0x3da6d0cb
    341	.long 0x0715ce53, 0xddc0152b
    342	.long 0x47db8317, 0x1c291d04
    343	.long 0x0d3b6092, 0x9e4addf8
    344	.long 0xc96cfdc0, 0x740eef02
    345	.long 0x878a92a7, 0x39d3b296
    346	.long 0xdaece73e, 0x083a6eec
    347	.long 0xab7aff2a, 0x0715ce53
    348	.long 0x2162d385, 0xc49f4f67
    349	.long 0x83348832, 0x47db8317
    350	.long 0x299847d5, 0x2ad91c30
    351	.long 0xb9e02b86, 0x0d3b6092
    352	.long 0x18b33a4e, 0x6992cea2
    353	.long 0xb6dd949b, 0xc96cfdc0
    354	.long 0x78d9ccb7, 0x7e908048
    355	.long 0xbac2fd7b, 0x878a92a7
    356	.long 0xa60ce07b, 0x1b3d8f29
    357	.long 0xce7f39f4, 0xdaece73e
    358	.long 0x61d82e56, 0xf1d0f55e
    359	.long 0xd270f1a2, 0xab7aff2a
    360	.long 0xc619809d, 0xa87ab8a8
    361	.long 0x2b3cac5d, 0x2162d385
    362	.long 0x65863b64, 0x8462d800
    363	.long 0x1b03397f, 0x83348832
    364	.long 0xebb883bd, 0x71d111a8
    365	.long 0xb3e32c28, 0x299847d5
    366	.long 0x064f7f26, 0xffd852c6
    367	.long 0xdd7e3b0c, 0xb9e02b86
    368	.long 0xf285651c, 0xdcb17aa4
    369	.long 0x10746f3c, 0x18b33a4e
    370	.long 0xc7a68855, 0xf37c5aee
    371	.long 0x271d9844, 0xb6dd949b
    372	.long 0x8e766a0c, 0x6051d5a2
    373	.long 0x93a5f730, 0x78d9ccb7
    374	.long 0x6cb08e5c, 0x18b0d4ff
    375	.long 0x6b749fb2, 0xbac2fd7b
    376	.long 0x1393e203, 0x21f3d99c
    377	.long 0xcec3662e, 0xa60ce07b
    378	.long 0x96c515bb, 0x8f158014
    379	.long 0xe6fc4e6a, 0xce7f39f4
    380	.long 0x8227bb8a, 0xa00457f7
    381	.long 0xb0cd4768, 0x61d82e56
    382	.long 0x39c7ff35, 0x8d6d2c43
    383	.long 0xd7a4825c, 0xd270f1a2
    384	.long 0x0ab3844b, 0x00ac29cf
    385	.long 0x0167d312, 0xc619809d
    386	.long 0xf6076544, 0xe9adf796
    387	.long 0x26f6a60a, 0x2b3cac5d
    388	.long 0xa741c1bf, 0x96638b34
    389	.long 0x98d8d9cb, 0x65863b64
    390	.long 0x49c3cc9c, 0xe0e9f351
    391	.long 0x68bce87a, 0x1b03397f
    392	.long 0x57a3d037, 0x9af01f2d
    393	.long 0x6956fc3b, 0xebb883bd
    394	.long 0x42d98888, 0x2cff42cf
    395	.long 0x3771e98f, 0xb3e32c28
    396	.long 0xb42ae3d9, 0x88f25a3a
    397	.long 0x2178513a, 0x064f7f26
    398	.long 0xe0ac139e, 0x4e36f0b0
    399	.long 0x170076fa, 0xdd7e3b0c
    400	.long 0x444dd413, 0xbd6f81f8
    401	.long 0x6f345e45, 0xf285651c
    402	.long 0x41d17b64, 0x91c9bd4b
    403	.long 0xff0dba97, 0x10746f3c
    404	.long 0xa2b73df1, 0x885f087b
    405	.long 0xf872e54c, 0xc7a68855
    406	.long 0x1e41e9fc, 0x4c144932
    407	.long 0x86d8e4d2, 0x271d9844
    408	.long 0x651bd98b, 0x52148f02
    409	.long 0x5bb8f1bc, 0x8e766a0c
    410	.long 0xa90fd27a, 0xa3c6f37a
    411	.long 0xb3af077a, 0x93a5f730
    412	.long 0x4984d782, 0xd7c0557f
    413	.long 0xca6ef3ac, 0x6cb08e5c
    414	.long 0x234e0b26, 0x63ded06a
    415	.long 0xdd66cbbb, 0x6b749fb2
    416	.long 0x4597456a, 0x4d56973c
    417	.long 0xe9e28eb4, 0x1393e203
    418	.long 0x7b3ff57a, 0x9669c9df
    419	.long 0xc9c8b782, 0xcec3662e
    420	.long 0x3f70cc6f, 0xe417f38a
    421	.long 0x93e106a4, 0x96c515bb
    422	.long 0x62ec6c6d, 0x4b9e0f71
    423	.long 0xd813b325, 0xe6fc4e6a
    424	.long 0x0df04680, 0xd104b8fc
    425	.long 0x2342001e, 0x8227bb8a
    426	.long 0x0a2a8d7e, 0x5b397730
    427	.long 0x6d9a4957, 0xb0cd4768
    428	.long 0xe8b6368b, 0xe78eb416
    429	.long 0xd2c3ed1a, 0x39c7ff35
    430	.long 0x995a5724, 0x61ff0e01
    431	.long 0x9ef68d35, 0xd7a4825c
    432	.long 0x0c139b31, 0x8d96551c
    433	.long 0xf2271e60, 0x0ab3844b
    434	.long 0x0b0bf8ca, 0x0bf80dd2
    435	.long 0x2664fd8b, 0x0167d312
    436	.long 0xed64812d, 0x8821abed
    437	.long 0x02ee03b2, 0xf6076544
    438	.long 0x8604ae0f, 0x6a45d2b2
    439	.long 0x363bd6b3, 0x26f6a60a
    440	.long 0x135c83fd, 0xd8d26619
    441	.long 0x5fabe670, 0xa741c1bf
    442	.long 0x35ec3279, 0xde87806c
    443	.long 0x00bcf5f6, 0x98d8d9cb
    444	.long 0x8ae00689, 0x14338754
    445	.long 0x17f27698, 0x49c3cc9c
    446	.long 0x58ca5f00, 0x5bd2011f
    447	.long 0xaa7c7ad5, 0x68bce87a
    448	.long 0xb5cfca28, 0xdd07448e
    449	.long 0xded288f8, 0x57a3d037
    450	.long 0x59f229bc, 0xdde8f5b9
    451	.long 0x6d390dec, 0x6956fc3b
    452	.long 0x37170390, 0xa3e3e02c
    453	.long 0x6353c1cc, 0x42d98888
    454	.long 0xc4584f5c, 0xd73c7bea
    455	.long 0xf48642e9, 0x3771e98f
    456	.long 0x531377e2, 0x80ff0093
    457	.long 0xdd35bc8d, 0xb42ae3d9
    458	.long 0xb25b29f2, 0x8fe4c34d
    459	.long 0x9a5ede41, 0x2178513a
    460	.long 0xa563905d, 0xdf99fc11
    461	.long 0x45cddf4e, 0xe0ac139e
    462	.long 0xacfa3103, 0x6c23e841
    463	.long 0xa51b6135, 0x170076fa