memset_64.S (2817B)
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string to get better performance than the original function. The code is
 * simpler and shorter than the original function as well.
 *
 * rdi destination
 * rsi value (char)
 * rdx count (bytes)
 *
 * rax original destination
 *
 * The first instruction is patched at boot via the alternatives mechanism:
 *  - no REP_GOOD, no ERMS:  jmp memset_orig   (open-coded qword loop below)
 *  - REP_GOOD only:         fall through      (rep stosq fast-string body)
 *  - ERMS:                  jmp memset_erms   (plain rep stosb)
 */
SYM_FUNC_START(__memset)
	/*
	 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
	 * to use it when possible. If not available, use fast string instructions.
	 *
	 * Otherwise, use original memset function.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS

	movq %rdi,%r9			/* save dest — rdi is advanced by rep stos */
	movq %rdx,%rcx
	andl $7,%edx			/* edx = count % 8 (trailing bytes) */
	shrq $3,%rcx			/* rcx = count / 8 (whole qwords) */
	/* expand byte value into all 8 lanes of rax */
	movzbl %sil,%esi		/* zero-extend the fill byte */
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax			/* rax = byte replicated 8 times */
	rep stosq			/* bulk store, 8 bytes per iteration */
	movl %edx,%ecx
	rep stosb			/* store the 0..7 remaining bytes */
	movq %r9,%rax			/* return original destination */
	RET
SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)

SYM_FUNC_ALIAS_WEAK(memset, __memset)
EXPORT_SYMBOL(memset)

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi destination
 * rsi value (char)
 * rdx count (bytes)
 *
 * rax original destination
 */
SYM_FUNC_START_LOCAL(memset_erms)
	movq %rdi,%r9			/* save dest for the return value */
	movb %sil,%al			/* rep stosb stores al */
	movq %rdx,%rcx			/* rcx = full byte count */
	rep stosb			/* ERMS makes a bare rep stosb fast */
	movq %r9,%rax
	RET
SYM_FUNC_END(memset_erms)

/*
 * Open-coded fallback for CPUs without fast strings:
 * align the destination, then store 64 bytes per loop iteration,
 * finishing with qword and byte tail loops.
 */
SYM_FUNC_START_LOCAL(memset_orig)
	movq %rdi,%r10			/* save dest for the return value */

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax			/* rax = fill byte replicated 8 times */

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d			/* r9d = dest % 8 */
	jnz .Lbad_alignment		/* unaligned: fix up first */
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx			/* rcx = count / 64 (unrolled iterations) */
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decq %rcx
	movq %rax,(%rdi)		/* 8 stores = one 64-byte chunk */
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi		/* lea: advance without touching flags */
	jnz .Lloop_64			/* loop on ZF from the decq above */

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %edx,%ecx
	andl $63&(~7),%ecx		/* whole qwords left within the last 64 bytes */
	jz .Lhandle_7
	shrl $3,%ecx			/* ecx = qword count */
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	andl $7,%edx			/* edx = final 0..7 bytes */
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %edx
	movb %al,(%rdi)			/* low byte of rax is the fill value */
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax			/* return original destination */
	RET

	/*
	 * Destination not 8-byte aligned (r9 = dest % 8, nonzero).
	 * For tiny fills (<= 7 bytes) just use the byte loop; otherwise do
	 * one unaligned qword store covering the misaligned prefix, then
	 * advance dest to the next 8-byte boundary and resume the fast path.
	 */
.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7			/* jbe: count is unsigned */
	movq %rax,(%rdi)		/* unaligned store */
	movq $8,%r8
	subq %r9,%r8			/* r8 = 8 - (dest % 8) = bytes to alignment */
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
.Lfinal:
SYM_FUNC_END(memset_orig)