cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

checksum_64.S (8149B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */
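
/*
 * A quick worked example of the 1's complement arithmetic used throughout
 * this file (illustration only, not part of the original source): words are
 * added and any carry out of the top bit is added back into the low end
 * ("end-around carry").  For 16-bit words, 0xffff + 0x0002 = 0x10001, which
 * folds to 0x0001 + 1 = 0x0002.  The routines below do the same thing with
 * 64-bit adds, deferring the carries to a final addze and fold.
 */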

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)
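
/*
 * Rough C model of the fold performed at .Lcsum_finish above (a sketch for
 * illustration only; fold64() is a hypothetical helper, not a kernel symbol).
 * The rldicl/add/srdi sequence adds the two 32-bit halves of the 64-bit
 * accumulator with end-around carry; one equivalent way to write that in C:
 *
 *	static inline unsigned int fold64(unsigned long long sum)
 *	{
 *		sum = (sum & 0xffffffffULL) + (sum >> 32);
 *		sum = (sum & 0xffffffffULL) + (sum >> 32);
 *		return (unsigned int)sum;
 *	}
 */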


	.macro srcnr
100:
	EX_TABLE(100b,.Lerror_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lerror)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Lerror_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Lerror)
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
_GLOBAL(csum_partial_copy_generic)
	li	r6,-1
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lerror:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lerror_nr:
	li	r3,0
	blr

EXPORT_SYMBOL(csum_partial_copy_generic)
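
/*
 * Sketch of the error contract implemented by the source/dest/srcnr/dstnr
 * macros above (illustration only; the caller code below is hypothetical).
 * Each macro places an EX_TABLE entry on the following load or store, so a
 * fault during that access branches to .Lerror (restore r14-r16 and pop the
 * stack frame) or .Lerror_nr, and the routine returns 0 instead of a
 * checksum.  The sum is seeded with 0xffffffff, which keeps a successful
 * result from colliding with the 0 error return, so a caller can do roughly:
 *
 *	sum = csum_partial_copy_generic(src, dst, len);
 *	if (!sum)
 *		return -EFAULT;		// fault while reading src or writing dst
 */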

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 */

_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)
	ld	r9, 8(r3)
	add	r5, r5, r6
	addc	r0, r8, r9
	ld	r10, 0(r4)
	ld	r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	rotldi	r5, r5, 8
#endif
	adde	r0, r0, r10
	add	r5, r5, r7
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0
	rotldi  r3, r0, 32		/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32
	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
	add	r3, r0, r3
	not	r3, r3
	rlwinm	r3, r3, 16, 16, 31
	blr
EXPORT_SYMBOL(csum_ipv6_magic)
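
/*
 * Conceptual C model of csum_ipv6_magic above (a sketch for illustration
 * only; ipv6_magic_model() and add64() are hypothetical names, len, proto
 * and the incoming sum are pre-combined into one term, and the little-endian
 * rotldi fixup is glossed over rather than mirrored line by line).  The
 * routine one's-complement-sums the two 128-bit addresses plus that term,
 * folds 64 -> 32 -> 16 bits, and returns the complement:
 *
 *	// add with end-around carry on 64 bits
 *	static unsigned long long add64(unsigned long long a, unsigned long long b)
 *	{
 *		unsigned long long s = a + b;
 *		return s + (s < a);
 *	}
 *
 *	static unsigned short ipv6_magic_model(const unsigned long long saddr[2],
 *					       const unsigned long long daddr[2],
 *					       unsigned long long len_proto_sum)
 *	{
 *		unsigned long long s;
 *
 *		s = add64(saddr[0], saddr[1]);
 *		s = add64(s, daddr[0]);
 *		s = add64(s, daddr[1]);
 *		s = add64(s, len_proto_sum);
 *		s = (s & 0xffffffffULL) + (s >> 32);	// fold 64 -> 32
 *		s = (s & 0xffffffffULL) + (s >> 32);
 *		s = (s & 0xffffULL) + (s >> 16);	// fold 32 -> 16
 *		s = (s & 0xffffULL) + (s >> 16);
 *		return (unsigned short)~s;
 *	}
 */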