copy_template.S (4034B)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored
 * by Linaro, which can be found at:
 *
 *   http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 *   files/head:/src/aarch64/
 */

/*
 * Copy a buffer from src to dest (alignment handled by the hardware).
 *
 * This is a template: the load/store helpers ldp1/stp1, ldr1/str1,
 * ldrb1/strb1 and ldrh1/strh1 are macros supplied by the file that
 * includes this template (so the same body can serve plain memcpy as
 * well as fault-handling user-copy variants). L1_CACHE_SHIFT is likewise
 * defined by the includer. Each helper performs one access of the named
 * width and post-increments its address register by the given amount.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16, the accesses may be unaligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading bytes from src to dst in increasing address
	 * order. This way the risk of overwriting source data is
	 * eliminated when the distance between src and dst is less than
	 * 16. The memory accesses here are aligned.
	 */
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16	/* 0x30 set: copy 48 bytes... */
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16	/* 0x20 set: copy 32 bytes... */
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16	/* 0x10 set: copy 16 bytes. */
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Copy the remaining 0-15 bytes. Prefer to break one ldp/stp into
	 * several loads/stores in increasing address order, rather than
	 * loading/storing 16 bytes at (src-16)/(dst-16) and backing src up
	 * to an aligned address, as the original cortex memcpy did. Keeping
	 * the original scheme would force memmove to satisfy the
	 * precondition that src is at least 16 bytes above dst, otherwise
	 * some source data would be overwritten when memmove calls memcpy
	 * directly. To keep memmove simple and decouple memcpy from it,
	 * that scheme was withdrawn.
	 */
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the load of the next 64-byte data block with the
	 * store of the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	/* Drain: store the final 64 bytes loaded by the loop above. */
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc: