memcpy.S - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
memcpy.S (5912B)
      1/* SPDX-License-Identifier: GPL-2.0-only */
      2/*
      3 * Copyright (c) 2012-2021, Arm Limited.
      4 *
      5 * Adapted from the original at:
      6 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
      7 */
      8
      9#include <linux/linkage.h>
     10#include <asm/assembler.h>
     11
     12/* Assumptions:
     13 *
     14 * ARMv8-a, AArch64, unaligned accesses.
     15 *
     16 */
     17
     18#define L(label) .L ## label	/* L(foo) pastes to .Lfoo, i.e. a .L-prefixed (non-exported) label */
     19
     20#define dstin	x0	/* destination pointer as passed in; never written by the routine */
     21#define src	x1	/* source pointer; moved by the large-copy paths */
     22#define count	x2	/* byte count; adjusted by the large-copy paths */
     23#define dst	x3	/* 16-byte-aligned destination cursor (forward large copy only) */
     24#define srcend	x4	/* set to src + count on entry */
     25#define dstend	x5	/* set to dstin + count on entry */
     26#define A_l	x6	/* A..F: data register pairs, _l/_h = first/second register of a 16-byte ldp/stp */
     27#define A_lw	w6	/* 32-bit view of A_l, used for 4-byte and 1-byte accesses */
     28#define A_h	x7
     29#define B_l	x8
     30#define B_lw	w8	/* 32-bit view of B_l */
     31#define B_h	x9
     32#define C_l	x10
     33#define C_lw	w10	/* 32-bit view of C_l */
     34#define C_h	x11
     35#define D_l	x12
     36#define D_h	x13
     37#define E_l	x14
     38#define E_h	x15
     39#define F_l	x16
     40#define F_h	x17
     41#define G_l	count	/* G and H are not extra registers: they reuse x2/x3 and x1/x4 */
     42#define G_h	dst	/* and are only loaded on paths where those values are no longer read */
     43#define H_l	src
     44#define H_h	srcend
     45#define tmp1	x14	/* same register as E_l; no path uses tmp1 after loading E_l */
     46
     47/* This implementation handles overlaps and supports both memcpy and memmove
     48   from a single entry point.  It uses unaligned accesses and branchless
     49   sequences to keep the code small, simple and improve performance.
     50
     51   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
     52   copies of up to 128 bytes, and large copies.  The overhead of the overlap
     53   check is negligible since it is only required for large copies.
     54
     55   Large copies use a software pipelined loop processing 64 bytes per iteration.
     56   The destination pointer is 16-byte aligned to minimize unaligned accesses.
     57   The loop tail is handled by always copying 64 bytes from the end.

   Register use, per the #defines in this file: on entry x0 = dstin,
   x1 = src, x2 = count.  x0 is never written, so it still holds the
   original destination pointer at every ret.  x1-x17 may all be
   overwritten (x1-x4 double as data registers G/H on some paths).
   The routine makes no calls and does not reference sp.

   For copies of up to 128 bytes every load is issued before the first
   store, which is why those paths need no overlap check.  Large copies
   choose a forward or backward loop from the unsigned value of
   dstin - src.
     58*/
     59
     60SYM_FUNC_START(__pi_memcpy)
     61	add	srcend, src, count	/* srcend = src + count (one past the last source byte) */
     62	add	dstend, dstin, count	/* dstend = dstin + count (one past the last destination byte) */
     63	cmp	count, 128
     64	b.hi	L(copy_long)	/* unsigned count > 128 */
     65	cmp	count, 32
     66	b.hi	L(copy32_128)	/* 33..128 */
     67
     68	/* Small copies: 0..32 bytes.  */
     69	cmp	count, 16
     70	b.lo	L(copy16)	/* count < 16 */
     71	ldp	A_l, A_h, [src]	/* 16..32: first 16 bytes */
     72	ldp	D_l, D_h, [srcend, -16]	/* last 16 bytes; overlaps the first 16 unless count == 32 */
     73	stp	A_l, A_h, [dstin]	/* both loads precede both stores, so src/dst overlap is harmless */
     74	stp	D_l, D_h, [dstend, -16]
     75	ret
     76
     77	/* Copy 8-15 bytes.  */
     78L(copy16):	/* reached with count < 16 */
     79	tbz	count, 3, L(copy8)	/* bit 3 clear => count < 8 */
     80	ldr	A_l, [src]	/* first 8 bytes */
     81	ldr	A_h, [srcend, -8]	/* last 8 bytes; overlaps the first 8 unless count == 8 */
     82	str	A_l, [dstin]
     83	str	A_h, [dstend, -8]
     84	ret
     85
     86	.p2align 3
     87	/* Copy 4-7 bytes.  */
     88L(copy8):	/* reached with count < 8 */
     89	tbz	count, 2, L(copy4)	/* bit 2 clear => count < 4 */
     90	ldr	A_lw, [src]	/* first 4 bytes */
     91	ldr	B_lw, [srcend, -4]	/* last 4 bytes; overlaps the first 4 unless count == 4 */
     92	str	A_lw, [dstin]
     93	str	B_lw, [dstend, -4]
     94	ret
     95
     96	/* Copy 0..3 bytes using a branchless sequence.  */
     97L(copy4):	/* reached with count < 4 */
     98	cbz	count, L(copy0)	/* zero length: nothing to copy */
     99	lsr	tmp1, count, 1	/* tmp1 = count / 2: 0 for count 1, 1 for count 2 or 3 */
    100	ldrb	A_lw, [src]	/* first byte */
    101	ldrb	C_lw, [srcend, -1]	/* last byte */
    102	ldrb	B_lw, [src, tmp1]	/* byte at count / 2; same as the first or last byte when count < 3 */
    103	strb	A_lw, [dstin]
    104	strb	B_lw, [dstin, tmp1]
    105	strb	C_lw, [dstend, -1]
    106L(copy0):	/* also the exit for large copies with dstin == src */
    107	ret
    108
    109	.p2align 4
    110	/* Medium copies: 33..128 bytes.  */
    111L(copy32_128):
    112	ldp	A_l, A_h, [src]	/* bytes 0..15 */
    113	ldp	B_l, B_h, [src, 16]	/* bytes 16..31 */
    114	ldp	C_l, C_h, [srcend, -32]	/* last 32 bytes in C and D */
    115	ldp	D_l, D_h, [srcend, -16]
    116	cmp	count, 64
    117	b.hi	L(copy128)	/* 65..128 */
    118	stp	A_l, A_h, [dstin]	/* 33..64: first 32 + last 32 bytes cover the whole range */
    119	stp	B_l, B_h, [dstin, 16]
    120	stp	C_l, C_h, [dstend, -32]
    121	stp	D_l, D_h, [dstend, -16]
    122	ret
    123
    124	.p2align 4
    125	/* Copy 65..128 bytes.  */
    126L(copy128):
    127	ldp	E_l, E_h, [src, 32]	/* bytes 32..47 */
    128	ldp	F_l, F_h, [src, 48]	/* bytes 48..63 */
    129	cmp	count, 96
    130	b.ls	L(copy96)	/* 65..96: first 64 + last 32 bytes are enough */
    131	ldp	G_l, G_h, [srcend, -64]	/* 97..128: G reuses count/dst, which are not read again */
    132	ldp	H_l, H_h, [srcend, -48]	/* H reuses src/srcend (its own base); neither is read again */
    133	stp	G_l, G_h, [dstend, -64]	/* first store on this path: all loads are already done */
    134	stp	H_l, H_h, [dstend, -48]
    135L(copy96):
    136	stp	A_l, A_h, [dstin]	/* first 64 bytes from dstin */
    137	stp	B_l, B_h, [dstin, 16]
    138	stp	E_l, E_h, [dstin, 32]
    139	stp	F_l, F_h, [dstin, 48]
    140	stp	C_l, C_h, [dstend, -32]	/* last 32 bytes from dstend */
    141	stp	D_l, D_h, [dstend, -16]
    142	ret
    143
    144	.p2align 4
    145	/* Copy more than 128 bytes.  */
    146L(copy_long):
    147	/* Use backwards copy if there is an overlap.  */
    148	sub	tmp1, dstin, src	/* tmp1 = dstin - src */
    149	cbz	tmp1, L(copy0)	/* same address: return without copying */
    150	cmp	tmp1, count
    151	b.lo	L(copy_long_backwards)	/* unsigned (dstin - src) < count: dstin lies inside the source range */
    152
    153	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
    154
    155	ldp	D_l, D_h, [src]	/* first 16 source bytes, stored unaligned at dstin below */
    156	and	tmp1, dstin, 15	/* tmp1 = dstin misalignment, 0..15 */
    157	bic	dst, dstin, 15	/* dst = dstin rounded down to a 16-byte boundary */
    158	sub	src, src, tmp1	/* move src back by the same amount so src and dst stay in step */
    159	add	count, count, tmp1	/* Count is now 16 too large.  */
    160	ldp	A_l, A_h, [src, 16]
    161	stp	D_l, D_h, [dstin]	/* unaligned store of the first 16 bytes */
    162	ldp	B_l, B_h, [src, 32]
    163	ldp	C_l, C_h, [src, 48]
    164	ldp	D_l, D_h, [src, 64]!	/* pre-index: src += 64, then load */
    165	subs	count, count, 128 + 16	/* Test and readjust count.  */
    166	b.ls	L(copy64_from_end)	/* nothing left for the loop; go straight to the tail */
    167
    168L(loop64):	/* each pass stores the 64 bytes already held in A..D and loads the next 64 */
    169	stp	A_l, A_h, [dst, 16]
    170	ldp	A_l, A_h, [src, 16]
    171	stp	B_l, B_h, [dst, 32]
    172	ldp	B_l, B_h, [src, 32]
    173	stp	C_l, C_h, [dst, 48]
    174	ldp	C_l, C_h, [src, 48]
    175	stp	D_l, D_h, [dst, 64]!	/* pre-index: dst += 64, then store */
    176	ldp	D_l, D_h, [src, 64]!	/* pre-index: src += 64, then load */
    177	subs	count, count, 64
    178	b.hi	L(loop64)	/* continue while count was > 64 before the subtraction */
    179
    180	/* Write the last iteration and copy 64 bytes from the end.  */
    181L(copy64_from_end):
    182	ldp	E_l, E_h, [srcend, -64]	/* E overwrites tmp1 (x14), which is no longer needed */
    183	stp	A_l, A_h, [dst, 16]	/* pending A..D are stored while the last 64 source bytes are loaded */
    184	ldp	A_l, A_h, [srcend, -48]
    185	stp	B_l, B_h, [dst, 32]
    186	ldp	B_l, B_h, [srcend, -32]
    187	stp	C_l, C_h, [dst, 48]
    188	ldp	C_l, C_h, [srcend, -16]
    189	stp	D_l, D_h, [dst, 64]
    190	stp	E_l, E_h, [dstend, -64]	/* final 64 bytes, addressed from dstend; may rewrite bytes stored above */
    191	stp	A_l, A_h, [dstend, -48]
    192	stp	B_l, B_h, [dstend, -32]
    193	stp	C_l, C_h, [dstend, -16]
    194	ret
    195
    196	.p2align 4
    197
    198	/* Large backwards copy for overlapping copies.
    199	   Copy 16 bytes and then align dstend to 16-byte alignment.  */
    200L(copy_long_backwards):
    201	ldp	D_l, D_h, [srcend, -16]	/* last 16 source bytes, stored unaligned below */
    202	and	tmp1, dstend, 15	/* tmp1 = dstend misalignment, 0..15 */
    203	sub	srcend, srcend, tmp1	/* move srcend back by the same amount */
    204	sub	count, count, tmp1
    205	ldp	A_l, A_h, [srcend, -16]
    206	stp	D_l, D_h, [dstend, -16]	/* unaligned store of the last 16 bytes (dstend not yet adjusted) */
    207	ldp	B_l, B_h, [srcend, -32]
    208	ldp	C_l, C_h, [srcend, -48]
    209	ldp	D_l, D_h, [srcend, -64]!	/* pre-index: srcend -= 64, then load */
    210	sub	dstend, dstend, tmp1	/* dstend is now 16-byte aligned */
    211	subs	count, count, 128
    212	b.ls	L(copy64_from_start)	/* nothing left for the loop; go straight to the tail */
    213
    214L(loop64_backwards):	/* mirror of loop64, walking srcend/dstend downwards */
    215	stp	A_l, A_h, [dstend, -16]
    216	ldp	A_l, A_h, [srcend, -16]
    217	stp	B_l, B_h, [dstend, -32]
    218	ldp	B_l, B_h, [srcend, -32]
    219	stp	C_l, C_h, [dstend, -48]
    220	ldp	C_l, C_h, [srcend, -48]
    221	stp	D_l, D_h, [dstend, -64]!	/* pre-index: dstend -= 64, then store */
    222	ldp	D_l, D_h, [srcend, -64]!	/* pre-index: srcend -= 64, then load */
    223	subs	count, count, 64
    224	b.hi	L(loop64_backwards)	/* continue while count was > 64 before the subtraction */
    225
    226	/* Write the last iteration and copy 64 bytes from the start.  */
    227L(copy64_from_start):
    228	ldp	G_l, G_h, [src, 48]	/* G reuses count/dst; count is not read again and dst is unused on this path */
    229	stp	A_l, A_h, [dstend, -16]	/* pending A..D are stored while the first 64 source bytes are loaded */
    230	ldp	A_l, A_h, [src, 32]
    231	stp	B_l, B_h, [dstend, -32]
    232	ldp	B_l, B_h, [src, 16]
    233	stp	C_l, C_h, [dstend, -48]
    234	ldp	C_l, C_h, [src]
    235	stp	D_l, D_h, [dstend, -64]
    236	stp	G_l, G_h, [dstin, 48]	/* first 64 bytes, addressed from dstin; may rewrite bytes stored above */
    237	stp	A_l, A_h, [dstin, 32]
    238	stp	B_l, B_h, [dstin, 16]
    239	stp	C_l, C_h, [dstin]
    240	ret
    241SYM_FUNC_END(__pi_memcpy)
    242
    243SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)	/* presumably makes __memcpy another name for __pi_memcpy; macro is defined outside this file -- confirm */
    244EXPORT_SYMBOL(__memcpy)
    245SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)	/* NOTE(review): _WEAK suggests another memcpy definition may take precedence -- confirm */
    246EXPORT_SYMBOL(memcpy)
    247
    248SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)	/* memmove names map to the same code: the routine above handles overlapping buffers itself */
    249
    250SYM_FUNC_ALIAS(__memmove, __pi_memmove)
    251EXPORT_SYMBOL(__memmove)
    252SYM_FUNC_ALIAS_WEAK(memmove, __memmove)	/* same pattern as memcpy above */
    253EXPORT_SYMBOL(memmove)