ev6-memchr.S - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
ev6-memchr.S (5444B)
      1/* SPDX-License-Identifier: GPL-2.0 */
      2/*
      3 * arch/alpha/lib/ev6-memchr.S
      4 *
      5 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
      6 *
      7 * Finds characters in a memory area.  Optimized for the Alpha:
      8 *
      9 *    - memory accessed as aligned quadwords only
     10 *    - uses cmpbge to compare 8 bytes in parallel
     11 *    - does binary search to find 0 byte in last
     12 *      quadword (HAKMEM needed 12 instructions to
     13 *      do this instead of the 9 instructions that
     14 *      binary search needs).
     15 *
     16 * For correctness consider that:
     17 *
     18 *    - only minimum number of quadwords may be accessed
     19 *    - the third argument is an unsigned long
     20 *
     21 * Much of the information about 21264 scheduling/coding comes from:
     22 *	Compiler Writer's Guide for the Alpha 21264
     23 *	abbreviated as 'CWG' in other comments here
     24 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
     25 * Scheduling notation:
     26 *	E	- either cluster
     27 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
     28 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
     29 * Try not to change the actual algorithm if possible for consistency.
     30 */
     31#include <asm/export.h>
     32        .set noreorder
     33        .set noat
     34
     35	.align	4
     36	.globl memchr
     37	.ent memchr
	/*
	 * memchr: search a memory area for the first occurrence of a byte.
	 *
	 * Inputs, as used by the code below:
	 *	$16 - address of the first byte to search
	 *	$17 - byte value to look for (only the low 8 bits are used;
	 *	      the register is overwritten with that byte replicated
	 *	      into all eight byte lanes)
	 *	$18 - number of bytes to search
	 * Result:
	 *	$0  - address of the first matching byte, or 0 if none is found
	 *
	 * Working registers:
	 *	$1, $4 - quadword under test / prefetched quadword
	 *	$2     - scratch; at $found_it a per-byte match mask from cmpbge
	 *	$3     - all ones (reused as scratch in $found_it)
	 *	$5     - address one past the last byte to search
	 *	$6, $7 - scratch
	 *
	 * The .frame directive below declares a frame size of 0; the routine
	 * does not touch the stack.
	 */
     38memchr:
     39	.frame $30,0,$26,0
     40	.prologue 0
     41
     42	# Hack -- if someone passes in (size_t)-1, hoping to just
     43	# search til the end of the address space, we will overflow
     44	# below when we find the address of the last byte.  Given
     45	# that we will never have a 56-bit address space, cropping
     46	# the length is the easiest way to avoid trouble.
	#
	# $5 receives the length with its top byte cleared.  A zero
	# length returns through $not_found at once.
     47	zap	$18, 0x80, $5	# U : Bound length
     48	beq	$18, $not_found	# U :
     49        ldq_u   $1, 0($16)	# L : load first quadword Latency=3
     50	and	$17, 0xff, $17	# E : L L U U : 00000000000000ch
     51
	# Build $17 = search byte replicated in all eight byte lanes, so
	# that one xor against a data quadword zeroes every matching byte.
	# $4 = 1 when the length is 8 or less; $3 = all ones.
     52	insbl	$17, 1, $2	# U : 000000000000ch00
     53	cmpult	$18, 9, $4	# E : small (< 1 quad) string?
     54	or	$2, $17, $17	# E : 000000000000chch
     55        lda     $3, -1($31)	# E : U L L U
     56
	# $5 becomes the address one past the last byte to search.
     57	sll	$17, 16, $2	# U : 00000000chch0000
     58	addq	$16, $5, $5	# E : Max search address
     59	or	$2, $17, $17	# E : 00000000chchchch
     60	sll	$17, 32, $2	# U : U L L U : chchchch00000000
     61
	# Lengths above 8 branch to $first_quad.  Otherwise the quadword
	# holding the last byte is loaded as well, so that the unaligned
	# quadword starting at $16 can be assembled from two aligned pieces.
     62	or	$2, $17, $17	# E : chchchchchchchch
     63	extql	$1, $16, $7	# U : $7 is upper bits
     64	beq	$4, $first_quad	# U :
     65	ldq_u	$6, -1($5)	# L : L U U L : eight or less bytes to search Latency=3
     66
	# $0 = base address that $found_it adds the byte index to.
     67	extqh	$6, $16, $6	# U : 2 cycle stall for $6
     68	mov	$16, $0		# E :
     69	nop			# E :
     70	or	$7, $6, $1	# E : L U L U $1 = quadword starting at $16
     71
     72	# Deal with the case where at most 8 bytes remain to be searched
     73	# in $1.  E.g.:
     74	#	$18 = 6
     75	#	$1 = ????c6c5c4c3c2c1
	#
	# The xor zeroes the bytes equal to the search byte; cmpbge against
	# $31 (zero) then sets one bit in $2 per zero byte.  The mask in $6
	# discards the bits for bytes beyond the remaining length.
     76$last_quad:
     77	negq	$18, $6		# E :
     78        xor	$17, $1, $1	# E :
     79	srl	$3, $6, $6	# U : $6 = mask of $18 bits set
     80        cmpbge  $31, $1, $2	# E : L U L U
     81
     82	nop
     83	nop
     84	and	$2, $6, $2	# E :
     85        beq     $2, $not_found	# U : U L U L
     86
	# At this point $0 = address of byte 0 of the quadword tested and
	# $2 = match mask with at least one bit set.  The result is
	# $0 + index of the lowest set bit of $2.
     87$found_it:
     88#ifdef CONFIG_ALPHA_EV67
     89	/*
     90	 * Since we are guaranteed to have set one of the bits, we don't
     91	 * have to worry about coming back with a 0x40 out of cttz...
     92	 */
     93	cttz	$2, $3		# U0 :
     94	addq	$0, $3, $0	# E : All done
     95	nop			# E :
     96	ret			# L0 : L U L U
     97#else
     98	/*
     99	 * Slow and clunky.  It can probably be improved.
    100	 * An exercise left for others.
    101	 */
	/*
	 * negq/and isolate the lowest set bit of $2.  Each cmoveq step
	 * then adds 4, 2 or 1 to $0 when that bit falls outside the mask
	 * 0x0f, 0x33 or 0x55 respectively, which accumulates the bit index.
	 */
    102        negq    $2, $3		# E :
    103        and     $2, $3, $2	# E :
    104        and     $2, 0x0f, $1	# E :
    105        addq    $0, 4, $3	# E :
    106
    107        cmoveq  $1, $3, $0	# E : Latency 2, extra map cycle
    108	nop			# E : keep with cmov
    109        and     $2, 0x33, $1	# E :
    110        addq    $0, 2, $3	# E : U L U L : 2 cycle stall on $0
    111
    112        cmoveq  $1, $3, $0	# E : Latency 2, extra map cycle
    113	nop			# E : keep with cmov
    114        and     $2, 0x55, $1	# E :
    115        addq    $0, 1, $3	# E : U L U L : 2 cycle stall on $0
    116
    117        cmoveq  $1, $3, $0	# E : Latency 2, extra map cycle
    118	nop
    119	nop
    120	ret			# L0 : L U L U
    121#endif
    122
    123	# Deal with the case where $18 > 8 bytes remain to be
    124	# searched.  $16 may not be aligned.
	#
	# $0 = aligned address of the first quadword.  The insqh mask has
	# 0xff in the bytes below the start offset; or-ing it in makes the
	# bytes that precede $16 nonzero so cmpbge cannot report them.
    125	.align 4
    126$first_quad:
    127	andnot	$16, 0x7, $0	# E :
    128        insqh   $3, $16, $2	# U : $2 = 0000ffffffffffff ($16<0:2> ff)
    129        xor	$1, $17, $1	# E :
    130	or	$1, $2, $1	# E : U L U L $1 = ====ffffffffffff
    131
    132        cmpbge  $31, $1, $2	# E :
    133        bne     $2, $found_it	# U :
    134	# At least one byte left to process.
	# Prefetch the next quadword; $18 = address of the last byte.
    135	ldq	$1, 8($0)	# L :
    136	subq	$5, 1, $18	# E : U L U L
    137
    138	addq	$0, 8, $0	# E :
    139	# Make $18 point to last quad to be accessed (the
    140	# last quad may or may not be partial).
    141	andnot	$18, 0x7, $18	# E :
    142	cmpult	$0, $18, $2	# E :
    143	beq	$2, $final	# U : U L U L
    144
    145	# At least two quads remain to be accessed.
    146
    147	subq	$18, $0, $4	# E : $4 <- nr quads to be processed
    148	and	$4, 8, $4	# E : odd number of quads?
    149	bne	$4, $odd_quad_count # U :
    150	# At least three quads remain to be accessed
    151	mov	$1, $4		# E : L U L U : move prefetched value to correct reg
    152
	# Two quadwords are tested per iteration.  This half tests $4 while
	# loading the next quadword into $1; the $odd_quad_count half tests
	# $1 while loading the next quadword into $4.
    153	.align	4
    154$unrolled_loop:
    155	ldq	$1, 8($0)	# L : prefetch $1
    156	xor	$17, $4, $2	# E :
    157	cmpbge	$31, $2, $2	# E :
    158	bne	$2, $found_it	# U : U L U L
    159
    160	addq	$0, 8, $0	# E :
    161	nop			# E :
    162	nop			# E :
    163	nop			# E :
    164
	# $6 = 1 while the quadword just prefetched lies before the last
	# quadword ($18); in that case the loop runs again.
    165$odd_quad_count:
    166	xor	$17, $1, $2	# E :
    167	ldq	$4, 8($0)	# L : prefetch $4
    168	cmpbge	$31, $2, $2	# E :
    169	addq	$0, 8, $6	# E :
    170
    171	bne	$2, $found_it	# U :
    172	cmpult	$6, $18, $6	# E :
    173	addq	$0, 8, $0	# E :
    174	nop			# E :
    175
	# When the loop is left, the prefetched quadword is moved back into
	# $1, the register that $last_quad tests.
    176	bne	$6, $unrolled_loop # U :
    177	mov	$4, $1		# E : move prefetched value into $1
    178	nop			# E :
    179	nop			# E :
    180
    181$final:	subq	$5, $0, $18	# E : $18 <- number of bytes left to do
    182	nop			# E :
    183	nop			# E :
    184	bne	$18, $last_quad	# U :
    185
	# No matching byte was found: return 0.
    186$not_found:
    187	mov	$31, $0		# E :
    188	nop			# E :
    189	nop			# E :
    190	ret			# L0 :
    191
    192        .end memchr
    193	EXPORT_SYMBOL(memchr)