/* memcmp_64.S (11692B) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

/* Scratch offset registers used as index operands for LD/lvx. */
#define off8	r6
#define off16	r7
#define off24	r8

/* Comparison data registers. rA..rC are volatile; rD..rH are
 * non-volatile GPRs and are saved/restored around the .Llong loop. */
#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

/*
 * Endian-neutral load/permute wrappers: on little endian the
 * byte-reversed loads (l?brx) plus lvsr give the same big-endian
 * ordered values that the plain loads plus lvsl give on big endian,
 * so the unsigned doubleword compares below order bytes correctly.
 */
#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

/* Only use the VMX path for compares of at least this many bytes. */
#define VMX_THRESH	4096

/*
 * Save r3/r4/r5 in the stack-parameter save area, call
 * enter_vmx_ops, then restore them.  Leaves cr1 set from
 * "cmpwi cr1,r3,0" on enter_vmx_ops' return value: cr1.eq means VMX
 * could not be used and the caller must branch to a non-VMX path.
 */
#define ENTER_VMX_OPS	\
	mflr	r0;	\
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	enter_vmx_ops; \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/* Mirror of ENTER_VMX_OPS for exit_vmx_ops; does not test the result. */
#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	exit_vmx_ops; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
 * 16 bytes boundary and permute the result with the 1st 16 bytes.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbb30
 *                                 ^
 *                               _vaddr
 *
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * In:	r3 = s1, r4 = s2, r5 = n (bytes to compare)
 * Out:	r3 = 0 if equal, 1 if the first differing byte/word is greater
 *	in s1, -1 if it is greater in s2.
 *
 * There are 2 categories for memcmp:
 * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
 * are named like .Lsameoffset_xxxx
 * 2) src/dst has different offset to the 8 bytes boundary. The handlers
 * are named like .Ldiffoffset_xxxx
 */
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Use the short loop if the src/dst addresses are not
	 * with the same offset of 8 bytes align boundary.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to short loop if compare at aligned addrs
	 * with less than 8 bytes.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	/* Byte-at-a-time compare, unrolled x4; CTR counts bytes left. */
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	/* Prefetch both source lines before the heavier loops. */
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* attempt to compare bytes not aligned with 8 bytes so that
	 * rest comparison can run based on 8 bytes alignment.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word which is not 8 bytes aligned:
	 * load the first double word at (src & ~7UL) and shift left appropriate
	 * bits before comparision.
	 * r6 = (r3 & 7) * 8 = number of leading bits to discard.
	 */
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3			/* back to a byte count */
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8			/* bytes consumed this doubleword */
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* now we are aligned with 8 bytes.
	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi	r5,r5,61		/* r5 = remaining bytes mod 8 */
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)		// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3			/* trailing bits to discard */
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	/* rC already holds the signed byte difference from .Lshort. */
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use vmx loop if length is equal or greater than 4K */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least s1 addr is aligned with 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	/* Save non-volatile GPRs used as rD..rH (redzone below r1). */
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5			/* CTR = 32-byte iterations */
	mtctr	r0
	andi.	r5,r5,31		/* r5 = tail bytes for .Lshort */

	/* Software-pipelined loop: loads for the next 32 bytes overlap
	 * the compares of the previous 32, one CR field per pair. */
	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	/* Drain the pipeline: compare the last two loaded groups. */
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	/* Only one 32-byte group was loaded; compare it and finish. */
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

/* Difference found: translate the recorded CR field into +/-1,
 * then restore the non-volatile GPRs at .Lout. */
.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter with src/dst addrs has the same offset with 8 bytes
	 * align boundary.
	 *
	 * There is an optimization based on following fact: memcmp()
	 * prones to fail early at the first 32 bytes.
	 * Before applying VMX instructions which will lead to 32x128bits
	 * VMX regs load/restore penalty, we compare the first 32 bytes
	 * so that we can catch the ~80% fail cases.
	 */

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp	/* VMX unavailable: scalar loop */

3:
	/* need to check whether r4 has the same offset with r3
	 * for 16 bytes boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is no less than 4KB. Need to align with 16 bytes further.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 across EXIT_VMX_OPS */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes for each loop */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59		/* r5 = tail bytes (< 32) */
	li	off16,16

.balign 16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f			/* not all-equal: diff in 1st QW */
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f			/* not all-equal: diff in 2nd QW */
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* diff the last 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 with 8 bytes */
	rlwinm	r6,r3,3,26,28		/* r6 = (r3 & 7) * 8 */
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4			/* unaligned load */
	sld	rA,rA,r6		/* drop bytes before s1 ... */
	srd	rA,rA,r6		/* ... then realign for compare */
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned with 8 bytes */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size equal or greater than 4K bytes */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32 bytes pre-checking before
	 * enable VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp	/* VMX unavailable */

.Ldiffoffset_vmx_cmp_start:
	/* Firstly try to align r3 with 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	/* Build permute masks, then compare one cross-boundary QW. */
	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16		/* advance to the 16B boundary */
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned with 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5			/* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6

.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8		/* carry 2nd QW over as next 1st */
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* anyway, the diff will appear in next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)