cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

memcpy_64.S (3547B)


/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

.pushsection .noinstr.text, "ax"

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs (those which set REP_GOOD). In addition, on
 * CPUs which have the enhanced REP MOVSB/STOSB feature (ERMS), those
 * NOPs are changed into a jmp to memcpy_erms, which does the copy with
 * REP; MOVSB.
 */
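
/*
 * Roughly, the ALTERNATIVE_2 below therefore leaves the entry point in
 * one of three forms after boot-time feature patching:
 *
 *   no REP_GOOD:  jmp memcpy_orig      (unrolled 32-byte copy loop)
 *   REP_GOOD:     NOPs, fall through   (rep movsq + rep movsb)
 *   ERMS:         jmp memcpy_erms      (single rep movsb)
 */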

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
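
/*
 * For reference, this register usage is simply the System V AMD64
 * calling convention for the C prototype
 *
 *     void *memcpy(void *dest, const void *src, size_t n);
 *
 * i.e. dest in %rdi, src in %rsi, n in %rdx, return value in %rax.
 */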
SYM_FUNC_START(__memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

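	/*
	 * REP_GOOD path (the NOPped-out alternative falls through to
	 * here): copy count/8 quadwords with rep movsq, then the
	 * remaining count%8 bytes with rep movsb. E.g. for count = 20
	 * this moves two 8-byte words followed by 4 trailing bytes.
	 */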
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
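
/*
 * Note that memcpy_erms is file-local; callers always enter through
 * memcpy/__memcpy, and the ALTERNATIVE_2 above patches that entry into
 * a jmp here when the CPU advertises X86_FEATURE_ERMS.
 */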
SYM_FUNC_START_LOCAL(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(memcpy_erms)

SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
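	/*
	 * Only the low bytes of the two pointers are compared here
	 * (%sil vs. %dil, a signed compare): when the source's low byte
	 * is below the destination's, the backward loop is used,
	 * presumably to avoid the load/store false dependence mentioned
	 * above when forward-copying between nearby addresses. This is
	 * a cheap heuristic, not a real overlap check.
	 */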
	cmp  %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
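	/*
	 * %rdx is biased down by an extra 0x20 before the loop: the
	 * subq at the top of each iteration then borrows (CF set) once
	 * fewer than 32 bytes would remain after that iteration's copy.
	 * The movs and leas in between do not touch the flags, so the
	 * trailing jae still tests that subtraction, and the addl $0x20
	 * after the loop restores %edx to the 0..31 byte tail count.
	 */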
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
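	/*
	 * As in the forward loop, %rdx is biased down by 0x20, and
	 * %rsi/%rdi now point one past the end of the buffers; the loop
	 * below walks 32-byte blocks downward until the biased count
	 * borrows. The addl/subq sequence after it restores %rsi/%rdi
	 * to the start of the still-uncopied head, whose length ends up
	 * in %edx and is handled by .Lhandle_tail.
	 */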
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Copy 16 to 31 bytes.
	 */
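	/*
	 * The first 16 bytes and the last 16 bytes of the region are
	 * loaded and then stored; for lengths below 32 the two windows
	 * simply overlap, so every byte is covered without a loop. The
	 * 8-15 and 4-7 byte cases below use the same overlapping trick
	 * with smaller windows.
	 */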
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Copy 8 to 15 bytes.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Copy 4 to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Copy 1 to 3 bytes.
	 */
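	/*
	 * The jz below still tests the flags from the subl above, since
	 * movzbl does not modify them: ZF means the length was exactly
	 * one byte. Otherwise %rdx now holds length-1, so (%rsi, %rdx)
	 * addresses the last byte, and bytes 1 and length-1 are stored
	 * (they coincide for length 2) before the first byte.
	 */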
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)

.popsection