cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

octeon-memcpy.S (12617B)


/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/export.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2
/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
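
/*
 * Illustration (editor's sketch, not from the original file): seen from
 * C, a raw copy helper built on __copy_user behaves roughly like
 *
 *	remaining = __raw_copy_from_user(dst, src, len);
 *	if (remaining != 0)
 *		...only the first (len - remaining) bytes of dst are valid;
 *
 * the "return value" being whatever is left in the a2/len register on
 * exit.  The actual glue lives in arch/mips/include/asm/uaccess.h.
 */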

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
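
/*
 * Worked example of the invariants (editor's note): enter __copy_user
 * with src_entry == 0x1000 and len == 0x40, so uaccess.h has set
 * AT = src_entry + len == 0x1040.  If a load faults once src has
 * advanced to 0x1020, the l_exc handler below computes
 * len = AT - fault_addr = 0x1040 - 0x1020 = 0x20 uncopied bytes, with
 * the faulting address taken from THREAD_BUADDR.
 */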

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR_WD	9b, handler;			\
	.previous
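
/*
 * For reference, an invocation such as EXC(LOAD t0, UNIT(0)(src), l_exc)
 * expands (roughly) to:
 *
 *	9:	ld	t0, 0(a1)
 *		.section __ex_table,"a"
 *		PTR_WD	9b, l_exc
 *		.previous
 *
 * i.e. the faultable instruction gets an entry in __ex_table so the
 * kernel's exception dispatch can branch to the named fixup handler.
 */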

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3
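
/*
 * NBYTES is the copy unit: one 64-bit register, i.e. eight bytes.  A
 * right shift by LOG_NBYTES turns a byte count into a unit count; e.g.
 * "SRL t0, len, LOG_NBYTES+2" in src_unaligned below counts iterations
 * of four units (32 bytes) each.
 */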

/*
 * As we are sharing the code base with the mips32 tree (which uses the
 * o32 ABI register definitions), we need to redefine the register
 * definitions from the n64 ABI register naming to the o32 ABI register
 * naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif
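
/*
 * How a pair assembles an unaligned doubleword (editor's note): on a
 * little-endian kernel with src == 0x1003, LDFIRST is "ldr t0, 0(src)",
 * fetching the five bytes 0x1003-0x1007 into the low end of t0, and
 * LDREST is "ldl t0, 7(src)", filling the three high bytes from
 * 0x1008-0x100a.  Big-endian kernels mirror the ldl/ldr (and sdl/sdr)
 * roles, hence the two macro sets.
 */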

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
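
/*
 * With NBYTES == 8: FIRST(1) == 8 and REST(1) == 15, so the
 * LDFIRST/LDREST pair for unit N touches bytes [8N, 8N+7] past src;
 * ADDRMASK == 7 is the mask used below to test src's alignment.
 */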

	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
EXPORT_SYMBOL(memcpy)
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__raw_copy_from_user)
EXPORT_SYMBOL(__raw_copy_from_user)
FEXPORT(__raw_copy_to_user)
EXPORT_SYMBOL(__raw_copy_to_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	#
	# Octeon doesn't care if the destination is unaligned. The hardware
	# can fix it faster than we can special case the assembly.
	#
	pref	0, 0(src)
	sltu	t0, len, NBYTES		# Check if < 1 word
	bnez	t0, copy_bytes_checklen
	 and	t0, src, ADDRMASK	# Check if src unaligned
	bnez	t0, src_unaligned
	 sltu	t0, len, 4*NBYTES	# Check if < 4 words
	bnez	t0, less_than_4units
	 sltu	t0, len, 8*NBYTES	# Check if < 8 words
	bnez	t0, less_than_8units
	 sltu	t0, len, 16*NBYTES	# Check if < 16 words
	bnez	t0, cleanup_both_aligned
	 sltu	t0, len, 128+1		# Check if len < 129
	bnez	t0, 1f			# Skip prefetch if len is too short
	 sltu	t0, len, 256+1		# Check if len < 257
	bnez	t0, 1f			# Skip prefetch if len is too short
	 pref	0, 128(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if there are more than 128 bytes left
2:	pref	0, 256(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if we can't prefetch anymore
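	#
	# Prefetch policy, summarized: while more than 256 bytes remain we
	# stay in the "2:" loop and prefetch 256 bytes ahead; once 256 or
	# fewer remain we fall through to "1:" and copy without prefetching,
	# since touching lines past the end of src could fault.  Either way,
	# one pass over the block below moves 16*NBYTES == 128 bytes.
	#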
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 16*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p16u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p15u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p14u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p13u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p12u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p11u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p10u)
	ADD	src, src, 16*NBYTES
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p9u)
	ADD	dst, dst, 16*NBYTES
EXC(	LOAD	t0, UNIT(-8)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-7)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-6)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-5)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-8)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(-7)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(-4)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-3)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-2)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-1)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(-1)(dst),	s_exc_p1u)
	sltu	t0, len, 256+1		# See if we can prefetch more
	beqz	t0, 2b
	 sltu	t0, len, 128		# See if we can loop one more time
	beqz	t0, 1b
	 nop
	#
	# Jump here if there are less than 16*NBYTES left.
	#
cleanup_both_aligned:
	beqz	len, done
	 sltu	t0, len, 8*NBYTES
	bnez	t0, less_than_8units
	 nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p1u)
	ADD	src, src, 8*NBYTES
	beqz	len, done
	 ADD	dst, dst, 8*NBYTES
	#
	# Jump here if there are less than 8*NBYTES left.
	#
less_than_8units:
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	 nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	ADD	src, src, 4*NBYTES
	beqz	len, done
	 ADD	dst, dst, 4*NBYTES
	#
	# Jump here if there are less than 4*NBYTES left. This means
	# we may need to copy up to 3 NBYTES words.
	#
less_than_4units:
	sltu	t0, len, 1*NBYTES
	bnez	t0, copy_bytes_checklen
	 nop
	#
	# 1) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	 ADD	dst, dst, NBYTES
	#
	# 2) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	 ADD	dst, dst, NBYTES
	#
	# 3) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	ADD	src, src, NBYTES
	ADD	dst, dst, NBYTES
	b copy_bytes_checklen
EXC(	 STORE	t0, -8(dst),		s_exc_p1u)

src_unaligned:
#define rem t8
	SRL	t0, len, LOG_NBYTES+2	 # +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)	 # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
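/*
 * Note the resulting schedule: the LDREST for t0 is separated from its
 * LDFIRST by the t1 load and the SUB, so no register takes two partial
 * loads back to back.
 */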
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	bne	len, rem, 1b
	 ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bne	len, rem, 1b
	 ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	 nop
copy_bytes:
	/* 0 < len < NBYTES  */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	 sb	t0, N(dst), s_exc_p1)

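/*
 * COPY_BYTE(0), for example, expands to an exception-tagged
 * "lb t0, 0(src)" / "sb t0, 0(dst)" pair around the length check; the
 * sb sits in the beqz delay slot, so the byte is still stored on the
 * iteration that branches to done.
 */
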
	COPY_BYTE(0)
	COPY_BYTE(1)
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	 sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	 nop
	END(memcpy)

l_exc_copy_rewind16:
	/* Rewind src and dst by 16*NBYTES for l_exc_copy */
	SUB	src, src, 16*NBYTES
	SUB	dst, dst, 16*NBYTES
l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	 ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	SUB	len, AT, t0		# len = number of uncopied bytes
	jr	ra
	 nop


#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	 ADD	len, len, n*NBYTES

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)
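
/*
 * Each SEXC(n) above emits a handler of the form
 *
 *	s_exc_p16u:
 *		jr	ra
 *		 ADD	len, len, 16*NBYTES
 *
 * The p<n>u suffix counts the units of the current iteration (the
 * faulting store included) that were never written: the copy loops
 * subtract a whole iteration from len up front, so the handler adds
 * those n units back before returning the uncopied-byte count.
 */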

s_exc_p1:
	jr	ra
	 ADD	len, len, 1
s_exc:
	jr	ra
	 nop

	.align	5
LEAF(memmove)
EXPORT_SYMBOL(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	 move	v0, a0				/* return value */
	beqz	a2, r_out
	END(memmove)
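	#
	# The test above, spelled out: t0 = (src < dst + len) and
	# t1 = (dst < src + len); the two ranges overlap only if both
	# hold, so when the AND is zero a plain forward __memcpy is safe.
	# A zero-length overlapping move short-circuits to r_out.
	#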

	/* fall through to __rmemcpy */
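	/*
	 * __rmemcpy tolerates overlap by copying one byte at a time: when
	 * dst > src it walks backwards from the end (r_end_bytes) so each
	 * source byte is read before it can be overwritten; when dst <= src
	 * a forward byte copy (r_end_bytes_up) is safe.  a2 is zeroed on
	 * return, matching the __copy_user convention of "0 bytes uncopied".
	 */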
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	 sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	 SUB	a0, a0, 0x1

r_out:
	jr	ra
	 move	a2, zero

r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	 ADD	a0, a0, 0x1

	jr	ra
	 move	a2, zero
	END(__rmemcpy)