aegis128-neon-inner.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 */

#ifdef CONFIG_ARM64
#include <asm/neon-intrinsics.h>

#define AES_ROUND "aese %0.16b, %1.16b \n\t aesmc %0.16b, %0.16b"
#else
#include <arm_neon.h>

#define AES_ROUND "aese.8 %q0, %q1 \n\t aesmc.8 %q0, %q0"
#endif

#define AEGIS_BLOCK_SIZE 16

#include <stddef.h>

extern int aegis128_have_aes_insn;

void *memcpy(void *dest, const void *src, size_t n);

struct aegis128_state {
        uint8x16_t v[5];
};

extern const uint8_t crypto_aes_sbox[];

static struct aegis128_state aegis128_load_state_neon(const void *state)
{
        return (struct aegis128_state){ {
                vld1q_u8(state),
                vld1q_u8(state + 16),
                vld1q_u8(state + 32),
                vld1q_u8(state + 48),
                vld1q_u8(state + 64)
        } };
}

static void aegis128_save_state_neon(struct aegis128_state st, void *state)
{
        vst1q_u8(state, st.v[0]);
        vst1q_u8(state + 16, st.v[1]);
        vst1q_u8(state + 32, st.v[2]);
        vst1q_u8(state + 48, st.v[3]);
        vst1q_u8(state + 64, st.v[4]);
}

static inline __attribute__((always_inline))
uint8x16_t aegis_aes_round(uint8x16_t w)
{
        uint8x16_t z = {};

#ifdef CONFIG_ARM64
        if (!__builtin_expect(aegis128_have_aes_insn, 1)) {
                static const uint8_t shift_rows[] = {
                        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
                        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
                };
                static const uint8_t ror32by8[] = {
                        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
                        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
                };
                uint8x16_t v;

                // shift rows
                w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

                // sub bytes
#ifndef CONFIG_CC_IS_GCC
                v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), w);
                v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), w - 0x40);
                v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), w - 0x80);
                v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), w - 0xc0);
#else
                asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w));
                w -= 0x40;
                asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w));
                w -= 0x40;
                asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w));
                w -= 0x40;
                asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w));
#endif

                // mix columns
                w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
                w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
                w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

                return w;
        }
#endif

        /*
         * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics
         * to force the compiler to issue the aese/aesmc instructions in pairs.
         * This is much faster on many cores, where the instruction pair can
         * execute in a single cycle.
         */
        asm(AES_ROUND : "+w"(w) : "w"(z));
        return w;
}

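/*
 * AEGIS-128 state update: aegis_aes_round() performs a single AES round
 * with an all-zero round key, so XOR-ing its result into the next state
 * word computes AESRound(a, b) = MixColumns(ShiftRows(SubBytes(a))) ^ b.
 * The helper below rotates all five 128-bit state words through one such
 * round and folds the message block m into v[0], keeping the whole state
 * in NEON registers for the duration of the update.
 */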
static inline __attribute__((always_inline))
struct aegis128_state aegis128_update_neon(struct aegis128_state st,
                                           uint8x16_t m)
{
        m ^= aegis_aes_round(st.v[4]);
        st.v[4] ^= aegis_aes_round(st.v[3]);
        st.v[3] ^= aegis_aes_round(st.v[2]);
        st.v[2] ^= aegis_aes_round(st.v[1]);
        st.v[1] ^= aegis_aes_round(st.v[0]);
        st.v[0] ^= m;

        return st;
}

static inline __attribute__((always_inline))
void preload_sbox(void)
{
        if (!IS_ENABLED(CONFIG_ARM64) ||
            !IS_ENABLED(CONFIG_CC_IS_GCC) ||
            __builtin_expect(aegis128_have_aes_insn, 1))
                return;

        asm("ld1 {v16.16b-v19.16b}, [%0], #64 \n\t"
            "ld1 {v20.16b-v23.16b}, [%0], #64 \n\t"
            "ld1 {v24.16b-v27.16b}, [%0], #64 \n\t"
            "ld1 {v28.16b-v31.16b}, [%0] \n\t"
            :: "r"(crypto_aes_sbox));
}

void crypto_aegis128_init_neon(void *state, const void *key, const void *iv)
{
        static const uint8_t const0[] = {
                0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d,
                0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62,
        };
        static const uint8_t const1[] = {
                0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1,
                0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd,
        };
        uint8x16_t k = vld1q_u8(key);
        uint8x16_t kiv = k ^ vld1q_u8(iv);
        struct aegis128_state st = {{
                kiv,
                vld1q_u8(const1),
                vld1q_u8(const0),
                k ^ vld1q_u8(const0),
                k ^ vld1q_u8(const1),
        }};
        int i;

        preload_sbox();

        for (i = 0; i < 5; i++) {
                st = aegis128_update_neon(st, k);
                st = aegis128_update_neon(st, kiv);
        }
        aegis128_save_state_neon(st, state);
}

void crypto_aegis128_update_neon(void *state, const void *msg)
{
        struct aegis128_state st = aegis128_load_state_neon(state);

        preload_sbox();

        st = aegis128_update_neon(st, vld1q_u8(msg));

        aegis128_save_state_neon(st, state);
}

#ifdef CONFIG_ARM
/*
 * AArch32 does not provide these intrinsics natively because it does not
 * implement the underlying instructions. AArch32 only provides the 64-bit
 * wide vtbl.8/vtbx.8 instructions, so use those instead.
 */
static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
{
        union {
                uint8x16_t val;
                uint8x8x2_t pair;
        } __a = { a };

        return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
                           vtbl2_u8(__a.pair, vget_high_u8(b)));
}

static uint8x16_t vqtbx1q_u8(uint8x16_t v, uint8x16_t a, uint8x16_t b)
{
        union {
                uint8x16_t val;
                uint8x8x2_t pair;
        } __a = { a };

        return vcombine_u8(vtbx2_u8(vget_low_u8(v), __a.pair, vget_low_u8(b)),
                           vtbx2_u8(vget_high_u8(v), __a.pair, vget_high_u8(b)));
}

static int8_t vminvq_s8(int8x16_t v)
{
        int8x8_t s = vpmin_s8(vget_low_s8(v), vget_high_s8(v));

        s = vpmin_s8(s, s);
        s = vpmin_s8(s, s);
        s = vpmin_s8(s, s);

        return vget_lane_s8(s, 0);
}
#endif

static const uint8_t permute[] __aligned(64) = {
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};

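/*
 * Partial blocks are handled using the permute[] table above: TBL returns
 * zero for out-of-range indices (and TBX leaves the destination lane
 * unchanged), so loading the table at an offset derived from 'size' shifts
 * the trailing 'size' input bytes into the low lanes of a zero-padded
 * 16-byte vector, and shifts the result back to the tail of the output
 * window on the store side. For inputs of at least one full block, the
 * final 16-byte store overlaps the previously written block, which is
 * simply stored again from 'msg'; for inputs shorter than one block, the
 * data is first staged in a stack buffer so no out-of-bounds access occurs.
 */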
void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
                                        unsigned int size)
{
        struct aegis128_state st = aegis128_load_state_neon(state);
        const int short_input = size < AEGIS_BLOCK_SIZE;
        uint8x16_t msg;

        preload_sbox();

        while (size >= AEGIS_BLOCK_SIZE) {
                uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];

                msg = vld1q_u8(src);
                st = aegis128_update_neon(st, msg);
                msg ^= s;
                vst1q_u8(dst, msg);

                size -= AEGIS_BLOCK_SIZE;
                src += AEGIS_BLOCK_SIZE;
                dst += AEGIS_BLOCK_SIZE;
        }

        if (size > 0) {
                uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
                uint8_t buf[AEGIS_BLOCK_SIZE];
                const void *in = src;
                void *out = dst;
                uint8x16_t m;

                if (__builtin_expect(short_input, 0))
                        in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);

                m = vqtbl1q_u8(vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
                               vld1q_u8(permute + 32 - size));

                st = aegis128_update_neon(st, m);

                vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
                         vqtbl1q_u8(m ^ s, vld1q_u8(permute + size)));

                if (__builtin_expect(short_input, 0))
                        memcpy(dst, out, size);
                else
                        vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
        }

        aegis128_save_state_neon(st, state);
}

void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
                                        unsigned int size)
{
        struct aegis128_state st = aegis128_load_state_neon(state);
        const int short_input = size < AEGIS_BLOCK_SIZE;
        uint8x16_t msg;

        preload_sbox();

        while (size >= AEGIS_BLOCK_SIZE) {
                msg = vld1q_u8(src) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
                st = aegis128_update_neon(st, msg);
                vst1q_u8(dst, msg);

                size -= AEGIS_BLOCK_SIZE;
                src += AEGIS_BLOCK_SIZE;
                dst += AEGIS_BLOCK_SIZE;
        }

        if (size > 0) {
                uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
                uint8_t buf[AEGIS_BLOCK_SIZE];
                const void *in = src;
                void *out = dst;
                uint8x16_t m;

                if (__builtin_expect(short_input, 0))
                        in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);

                m = s ^ vqtbx1q_u8(s, vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
                                   vld1q_u8(permute + 32 - size));

                st = aegis128_update_neon(st, m);

                vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
                         vqtbl1q_u8(m, vld1q_u8(permute + size)));

                if (__builtin_expect(short_input, 0))
                        memcpy(dst, out, size);
                else
                        vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
        }

        aegis128_save_state_neon(st, state);
}

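/*
 * Finalization: the associated data and message lengths (in bits) are
 * mixed into the state over seven update rounds, and the tag is the XOR
 * of all five state words. When authsize > 0, the computed tag is
 * compared against the contents of tag_xor entirely in NEON, without a
 * data-dependent branch: bytes beyond authsize are masked off via the
 * permute[] table, and the result is 0 on match and negative on mismatch.
 * When authsize == 0, the computed tag is simply stored to tag_xor.
 */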
int crypto_aegis128_final_neon(void *state, void *tag_xor,
                               unsigned int assoclen,
                               unsigned int cryptlen,
                               unsigned int authsize)
{
        struct aegis128_state st = aegis128_load_state_neon(state);
        uint8x16_t v;
        int i;

        preload_sbox();

        v = st.v[3] ^ (uint8x16_t)vcombine_u64(vmov_n_u64(8ULL * assoclen),
                                               vmov_n_u64(8ULL * cryptlen));

        for (i = 0; i < 7; i++)
                st = aegis128_update_neon(st, v);

        v = st.v[0] ^ st.v[1] ^ st.v[2] ^ st.v[3] ^ st.v[4];

        if (authsize > 0) {
                v = vqtbl1q_u8(~vceqq_u8(v, vld1q_u8(tag_xor)),
                               vld1q_u8(permute + authsize));

                return vminvq_s8((int8x16_t)v);
        }

        vst1q_u8(tag_xor, v);
        return 0;
}
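
/*
 * Usage note (illustrative, not part of this file): these entry points
 * assume the NEON register file is available to kernel code, so callers
 * are expected to bracket them with kernel_neon_begin()/kernel_neon_end().
 * A glue wrapper might look like the sketch below; the wrapper name is
 * made up for illustration.
 *
 *      void aegis128_update_simd(void *state, const void *msg)
 *      {
 *              kernel_neon_begin();
 *              crypto_aegis128_update_neon(state, msg);
 *              kernel_neon_end();
 *      }
 */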