cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

memcopy.S (12694B)


/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
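
/*
 * For orientation, an equivalent C sketch of the general-case strategy
 * described above (illustrative only -- not built as part of this file;
 * it omits the IRAM/IROM concerns and the SRC-based shifting copy,
 * falling back to byte copies for an unaligned source):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	void *memcpy_sketch(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		if (len >= 7) {
 *			while ((uintptr_t)d & 3) {	// align dst to a word
 *				*d++ = *s++;
 *				len--;
 *			}
 *			if (!((uintptr_t)s & 3)) {	// src aligned too:
 *				while (len >= 4) {	// word-at-a-time copy
 *					*(uint32_t *)d = *(const uint32_t *)s;
 *					d += 4; s += 4; len -= 4;
 *				}
 *			}
 *		}
 *		while (len--)		// short copies and the tail
 *			*d++ = *s++;
 *		return dst;
 *	}
 */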

	.text

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3,  0
	addi	a3, a3,  1
	addi	a4, a4, -1
	s8i	a6, a5,  0
	addi	a5, a5,  1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	addi	a4, a4, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

ENTRY(__memcpy)
WEAK(memcpy)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
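	# Note: loads and stores in .Loop1 below are interleaved (each load
	# is separated from its matching store by at least one other
	# instruction), which helps hide load-use latency inside the loop.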
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */
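
/*
 * The shifting copy below reads only aligned words from the source and
 * reconstructs each destination word with SSA8/SRC (a funnel shift).
 * An illustrative C sketch of one step, for the little-endian case
 * (the 'sh != 0' guard avoids C's undefined shift by 32, which the
 * hardware SRC instruction itself does not need):
 *
 *	static inline uint32_t src_b(uint32_t lo, uint32_t hi, unsigned sh)
 *	{
 *		return sh ? (lo >> sh) | (hi << (32 - sh)) : lo;
 *	}
 *
 *	unsigned sh = 8 * ((uintptr_t)src & 3);	// set by __ssa8 below
 *	const uint32_t *sw = (const uint32_t *)((uintptr_t)src & ~3);
 *	*(uint32_t *)dst = src_b(sw[0], sw[1], sh);
 */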

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT	1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16
	__src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	__src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	abi_ret_default
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	abi_ret_default
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memcpy)

/*
 * void bcopy(const void *src, void *dest, size_t n);
 */
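
/*
 * bcopy() takes its pointer arguments in the opposite order from
 * memmove(); the three mov instructions below just swap them and fall
 * into the common memmove path.  Equivalent C, for reference:
 *
 *	void bcopy(const void *src, void *dest, size_t n)
 *	{
 *		memmove(dest, src, n);
 *	}
 */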

ENTRY(bcopy)

	abi_entry_default
	# a2=src, a3=dst, a4=len
	mov	a5, a3
	mov	a3, a2
	mov	a2, a5
	j	.Lmovecommon	# go to common code for memmove+bcopy

ENDPROC(bcopy)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If end of source doesn't overlap destination then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
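
/*
 * The overlap test at .Lmovecommon ("sub a6, a5, a3; bgeu a6, a4, ...")
 * exploits unsigned wraparound so that a single compare handles both
 * directions.  A C sketch of the decision:
 *
 *	if ((uintptr_t)dst - (uintptr_t)src >= len)
 *		// dst is below src, or beyond its end: forward copy
 *		// (the memcpy path)
 *	else
 *		// dst overlaps the tail of src: copy backward,
 *		// starting from src + len and dst + len
 */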

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte # continue loop if
				       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

ENTRY(__memmove)
WEAK(memmove)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon

	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3,  8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3,  4
	s32i	a6, a5,  8
	l32i	a6, a3,  0
	s32i	a7, a5,  4
	s32i	a6, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .LbackLoop1  # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a5, a5, -8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3,  0
	addi	a5, a5, -4
	s32i	a6, a5,  0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3,  0
	addi	a5, a5, -2
	s16i	a6, a5,  0
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */
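
/*
 * Mirror image of the forward shifting copy (.Loop2 above): each
 * destination word is still built from an adjacent pair of aligned
 * source words via __src_b (lower-addressed word first), but the loop
 * walks the buffers downward, so the register carried across
 * iterations now holds the higher-addressed word of the pair.
 */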

	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3,  8
	addi	a5, a5, -16
	__src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  8
	l32i	a6, a3,  0
	__src_b	a8, a9, a8
	s32i	a8, a5,  4
	__src_b	a9, a6, a9
	s32i	a9, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .LbackLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3,  4
	l32i	a8, a3,  0
	addi	a5, a5, -8
	__src_b	a6, a7, a6
	s32i	a6, a5,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  0
	mov	a6, a8
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3,  0
	addi	a5, a5, -4
	__src_b	a6, a7, a6
	s32i	a6, a5,  0
	mov	a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	abi_ret_default
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	bbsi.l	a4, 0, .Lback15
	abi_ret_default
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memmove)