copy_template.S (4034B)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored
 * by Linaro, which can be found at:
 *
 *   http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 *   files/head:/src/aarch64/
 */

/*
 * Copy a buffer from src to dest (alignment handled by the hardware).
 *
 * This is a template: the load/store helpers ldp1/stp1, ldr1/str1,
 * ldrb1/strb1 and ldrh1/strh1 are macros supplied by the file that
 * includes this template (so the same body can serve plain memcpy as
 * well as fault-handling user-copy variants). L1_CACHE_SHIFT is likewise
 * defined by the includer. Each helper performs one access of the named
 * width and post-increments its address register by the given amount.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16, the accesses may be unaligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading bytes from src to dst in increasing address
	 * order. This way the risk of overwriting source data is
	 * eliminated when the distance between src and dst is less than
	 * 16. The memory accesses here are aligned.
	 */
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16	/* 0x30 set: copy 48 bytes... */
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16	/* 0x20 set: copy 32 bytes... */
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16	/* 0x10 set: copy 16 bytes. */
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Copy the remaining 0-15 bytes. Prefer to break one ldp/stp into
	 * several loads/stores in increasing address order, rather than
	 * loading/storing 16 bytes at (src-16)/(dst-16) and backing src up
	 * to an aligned address, as the original cortex memcpy did. Keeping
	 * the original scheme would force memmove to satisfy the
	 * precondition that src is at least 16 bytes above dst, otherwise
	 * some source data would be overwritten when memmove calls memcpy
	 * directly. To keep memmove simple and decouple memcpy from it,
	 * that scheme was withdrawn.
	 */
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the load of the next 64-byte data block with the
	 * store of the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	/* Drain: store the final 64 bytes loaded by the loop above. */
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc: