aes-neon.S (7628B)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Every function emitted through these wrappers gets a neon_ name prefix. */
#define AES_FUNC_START(func)		SYM_FUNC_START(neon_ ## func)
#define AES_FUNC_END(func)		SYM_FUNC_END(neon_ ## func)

	/*
	 * Fixed NEON register roles used by the macros in this file:
	 *
	 *   v8  - v11	scratch
	 *   v12	0x1b in every byte (set up by 'prepare')
	 *   v13	ShiftRows permutation for tbl (set up by 'prepare')
	 *   v14	.Lror32by8 permutation for tbl (set up by 'prepare')
	 *   v15	current round key, temporarily replaced by 0x40 in
	 *		every byte while the Sbox lookup runs
	 *   v16 - v31	the 256 Sbox bytes (set up by 'prepare')
	 *
	 * The aliases below are not referenced by any macro in this file;
	 * presumably they are consumed by aes-modes.S - verify there.
	 * Note that xtsmask and cbciv name the same register.
	 */
	xtsmask		.req	v7
	cbciv		.req	v7
	vctr		.req	v4

	/*
	 * Forwards to xts_load_mask, which is not defined in this file
	 * (presumably supplied by aes-modes.S).
	 */
	.macro		xts_reload_mask, tmp
	xts_load_mask	\tmp
	.endm

	/*
	 * Special case for the neon-bs driver calling into this one for
	 * CTS: branch to 'lbl' when bit 1 of 'reg' is set.
	 */
	.macro		xts_cts_skip_tw, reg, lbl
	tbnz		\reg, #1, \lbl
	.endm

	/*
	 * Multiply by the polynomial 'x' in GF(2^8).
	 *
	 * The arithmetic shift right by 7 turns the top bit of each element
	 * into an all-ones or all-zeroes mask, which selects 'const' as the
	 * reduction term to fold into the value shifted left by one.
	 * 'temp' is clobbered.
	 */
	.macro		mul_by_x, out, in, temp, const
	sshr		\temp, \in, #7
	shl		\out, \in, #1
	and		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm

	/*
	 * Multiply by the polynomial 'x^2' in GF(2^8).
	 *
	 * The two bits that the shift left by two pushes out are isolated
	 * with a logical shift right by 6 and polynomial-multiplied by
	 * 'const' to form the reduction term. 'temp' is clobbered.
	 */
	.macro		mul_by_x2, out, in, temp, const
	ushr		\temp, \in, #6
	shl		\out, \in, #2
	pmul		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm

	/*
	 * Preload the constants and the entire Sbox:
	 *   v12     <- 0x1b in every byte
	 *   v13     <- the permutation found at 'shiftrows'
	 *   v14     <- the permutation found at .Lror32by8
	 *   v16-v31 <- 256 bytes read from 'sbox', 64 bytes per group of
	 *              four registers
	 * 'temp' is a general purpose scratch register; on exit it points
	 * at the last 64 byte quarter of 'sbox'. ldr_l and adr_l are not
	 * defined in this file (presumably they come from
	 * <asm/assembler.h>).
	 */
	.macro		prepare, sbox, shiftrows, temp
	movi		v12.16b, #0x1b
	ldr_l		q13, \shiftrows, \temp
	ldr_l		q14, .Lror32by8, \temp
	adr_l		\temp, \sbox
	ld1		{v16.16b-v19.16b}, [\temp], #64
	ld1		{v20.16b-v23.16b}, [\temp], #64
	ld1		{v24.16b-v27.16b}, [\temp], #64
	ld1		{v28.16b-v31.16b}, [\temp]
	.endm

	/* Preload for encryption; the first two arguments are unused. */
	.macro		enc_prepare, ignore0, ignore1, temp
	prepare		crypto_aes_sbox, .LForward_ShiftRows, \temp
	.endm

	/* Expands to nothing in this implementation. */
	.macro		enc_switch_key, ignore0, ignore1, temp
	/* do nothing */
	.endm

	/* Preload for decryption; the first two arguments are unused. */
	.macro		dec_prepare, ignore0, ignore1, temp
	prepare		crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
	.endm

	/*
	 * Apply the SubBytes transformation using the preloaded Sbox.
	 *
	 * Expects 0x40 in every byte of v15. A four register tbl/tbx only
	 * covers indexes 0-63, so the lookup is done in four steps, with
	 * the index lowered by 0x40 before each of the last three. tbl
	 * writes zero for an out-of-range index and tbx leaves the
	 * destination byte untouched, so exactly one step supplies each
	 * result byte.
	 *
	 * 'in' is a bare register name (vN). Clobbers v9 - v11.
	 */
	.macro		sub_bytes, in
	sub		v9.16b, \in\().16b, v15.16b
	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
	sub		v10.16b, v9.16b, v15.16b
	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v11.16b, v10.16b, v15.16b
	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
	.endm

	/*
	 * Apply the MixColumns transformation ('enc' != 0), or its inverse
	 * ('enc' == 0), to the state in 'in' (a bare register name).
	 *
	 * Uses v12 (0x1b constant) and v14 (.Lror32by8 permutation).
	 * Clobbers v8 and v9.
	 */
	.macro		mix_columns, in, enc
	.ifeq		\enc
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
	eor		\in\().16b, \in\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in\().16b, \in\().16b, v8.16b
	.endif

	/* the forward transformation is shared by both directions */
	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
	rev32		v8.8h, \in\().8h
	eor		v8.16b, v8.16b, v9.16b
	eor		\in\().16b, \in\().16b, v8.16b
	tbl		\in\().16b, {\in\().16b}, v14.16b
	eor		\in\().16b, \in\().16b, v8.16b
	.endm

	/*
	 * Run the full cipher on the single AES state in 'in'.
	 *
	 *   enc	1 for encryption, 0 for decryption
	 *   in		state register (bare name, vN); updated in place
	 *   rounds	number of rounds
	 *   rk		address of the first round key; not modified
	 *   rkp	scratch pointer that walks the remaining round keys
	 *   i		scratch round counter
	 *
	 * 'rounds' + 1 round keys of 16 bytes each are read, starting at
	 * 'rk'. v15 alternates between holding the round key and the 0x40
	 * constant needed by sub_bytes, which is why the key is reloaded
	 * after every Sbox lookup. The last round skips MixColumns.
	 *
	 * Clobbers v8 - v11, v15, 'rkp', 'i' and the condition flags.
	 */
	.macro		do_block, enc, in, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
.Lneon_round_\@:
	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	movi		v15.16b, #0x40				/* Sbox step */
	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
	sub_bytes	\in
	subs		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16			/* next key */
	beq		.Lneon_last_\@
	mix_columns	\in, \enc
	b		.Lneon_round_\@
.Lneon_last_\@:
	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	.endm

	/* Encrypt one block; see do_block for the arguments. */
	.macro		encrypt_block, in, rounds, rk, rkp, i
	do_block	1, \in, \rounds, \rk, \rkp, \i
	.endm

	/* Decrypt one block; see do_block for the arguments. */
	.macro		decrypt_block, in, rounds, rk, rkp, i
	do_block	0, \in, \rounds, \rk, \rkp, \i
	.endm

	/*
	 * Interleaved versions: functionally equivalent to the
	 * ones above, but applied to AES states in parallel.
	 */

	/*
	 * SubBytes on four states at once. Same scheme and the same 0x40
	 * requirement on v15 as sub_bytes, with the lookups of the four
	 * states interleaved. Clobbers v8 - v11.
	 */
	.macro		sub_bytes_4x, in0, in1, in2, in3
	sub		v8.16b, \in0\().16b, v15.16b
	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	sub		v9.16b, \in1\().16b, v15.16b
	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub		v10.16b, \in2\().16b, v15.16b
	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
	sub		v11.16b, \in3\().16b, v15.16b
	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
	.endm

	/*
	 * mul_by_x on two values at once. Unlike mul_by_x, all operands
	 * are bare register names; the .16b arrangement is added here.
	 */
	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
	sshr		\tmp0\().16b, \in0\().16b, #7
	shl		\out0\().16b, \in0\().16b, #1
	sshr		\tmp1\().16b, \in1\().16b, #7
	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #1
	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	/*
	 * mul_by_x2 on two values at once. Unlike mul_by_x2, all operands
	 * are bare register names; the .16b arrangement is added here.
	 */
	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
	ushr		\tmp0\().16b, \in0\().16b, #6
	shl		\out0\().16b, \in0\().16b, #2
	ushr		\tmp1\().16b, \in1\().16b, #6
	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #2
	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	/*
	 * (Inverse) MixColumns on two states at once; 'enc' selects the
	 * direction as in mix_columns. Uses v12 and v14, clobbers v8 - v11.
	 */
	.macro		mix_columns_2x, in0, in1, enc
	.ifeq		\enc
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
	eor		\in0\().16b, \in0\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in1\().16b, \in1\().16b, v9.16b
	rev32		v9.8h, v9.8h
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	.endif

	/* the forward transformation is shared by both directions */
	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
	rev32		v10.8h, \in0\().8h
	rev32		v11.8h, \in1\().8h
	eor		v10.16b, v10.16b, v8.16b
	eor		v11.16b, v11.16b, v9.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	tbl		\in0\().16b, {\in0\().16b}, v14.16b
	tbl		\in1\().16b, {\in1\().16b}, v14.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	.endm

	/* XOR the round key held in v15 into four states. */
	.macro		neon_add_round_key_4x, in0, in1, in2, in3
	eor		\in0\().16b, \in0\().16b, v15.16b
	eor		\in1\().16b, \in1\().16b, v15.16b
	eor		\in2\().16b, \in2\().16b, v15.16b
	eor		\in3\().16b, \in3\().16b, v15.16b
	.endm

	/* Permute four states with the ShiftRows table held in v13. */
	.macro		neon_shift_rows_4x, in0, in1, in2, in3
	tbl		\in0\().16b, {\in0\().16b}, v13.16b
	tbl		\in1\().16b, {\in1\().16b}, v13.16b
	tbl		\in2\().16b, {\in2\().16b}, v13.16b
	tbl		\in3\().16b, {\in3\().16b}, v13.16b
	.endm

	/*
	 * Run the full cipher on the four AES states in0 - in3, which all
	 * share the same key schedule. The remaining arguments, the use of
	 * v15 and the number of round keys read are as for do_block.
	 *
	 * Clobbers v8 - v11, v15, 'rkp', 'i' and the condition flags.
	 */
	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
.Lneon_round4x_\@:
	neon_add_round_key_4x	\in0, \in1, \in2, \in3
	movi		v15.16b, #0x40				/* Sbox step */
	neon_shift_rows_4x	\in0, \in1, \in2, \in3
	sub_bytes_4x	\in0, \in1, \in2, \in3
	subs		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16			/* next key */
	beq		.Lneon_last4x_\@
	mix_columns_2x	\in0, \in1, \enc
	mix_columns_2x	\in2, \in3, \enc
	b		.Lneon_round4x_\@
.Lneon_last4x_\@:
	neon_add_round_key_4x	\in0, \in1, \in2, \in3
	.endm

	/* Encrypt four blocks; see do_block_4x for the arguments. */
	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

	/* Decrypt four blocks; see do_block_4x for the arguments. */
	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

#include "aes-modes.S"

	/*
	 * Index vectors for tbl, loaded as whole 128-bit values by
	 * 'prepare'. The least significant byte of each literal is the
	 * source byte index for result byte 0, and so on upwards.
	 */
	.section	".rodata", "a"
	.align		4
.LForward_ShiftRows:
	.octa		0x0b06010c07020d08030e09040f0a0500

.LReverse_ShiftRows:
	.octa		0x0306090c0f0205080b0e0104070a0d00

	/* result byte n of each 32-bit group is source byte (n + 1) % 4 */
.Lror32by8:
	.octa		0x0c0f0e0d080b0a090407060500030201