ghash-ce-core.S
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	XH2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm
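
	/*
	 * For reference only, a minimal C sketch (illustrative names, not
	 * kernel API) of the 64x64 -> 128 bit carry-less multiplication that
	 * vmull.p64 performs in a single instruction, and that __pmull_p8
	 * below emulates with 8x8 -> 16 bit vmull.p8 multiplies:
	 *
	 *	#include <stdint.h>
	 *
	 *	struct u128 { uint64_t lo, hi; };
	 *
	 *	static struct u128 clmul64(uint64_t a, uint64_t b)
	 *	{
	 *		struct u128 r = { 0, 0 };
	 *		int i;
	 *
	 *		for (i = 0; i < 64; i++) {
	 *			if ((b >> i) & 1) {
	 *				r.lo ^= a << i;	// XOR, not ADD: no carries
	 *				if (i)
	 *					r.hi ^= a >> (64 - i);
	 *			}
	 *		}
	 *		return r;
	 *	}
	 */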

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm
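
	/*
	 * How the three partial products fit together (a rough C sketch,
	 * reusing the illustrative clmul64()/u128 helpers from the comment
	 * near the top of this file; not kernel API): the 128-bit operands
	 * are each split into 64-bit halves and multiplied Karatsuba-style
	 * with three carry-less multiplies.  The 256-bit result is then
	 * folded back to 128 bits by __pmull_reduce_p64/__pmull_reduce_p8,
	 * which reduce modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1
	 * in its bit-reflected representation (the constant 0xe1 << 57
	 * loaded as MASK in pmull_ghash_update_p64 encodes that polynomial).
	 *
	 *	static void clmul128(uint64_t a1, uint64_t a0,
	 *			     uint64_t b1, uint64_t b0,
	 *			     struct u128 *hi, struct u128 *lo)
	 *	{
	 *		struct u128 xh = clmul64(a1, b1);           // XH
	 *		struct u128 xl = clmul64(a0, b0);           // XL
	 *		struct u128 xm = clmul64(a1 ^ a0, b1 ^ b0); // XM
	 *
	 *		// Karatsuba middle term: XM ^ XH ^ XL, shifted up 64 bits
	 *		xm.lo ^= xh.lo ^ xl.lo;
	 *		xm.hi ^= xh.hi ^ xl.hi;
	 *
	 *		lo->lo = xl.lo;
	 *		lo->hi = xl.hi ^ xm.lo;
	 *		hi->lo = xh.lo ^ xm.hi;
	 *		hi->hi = xh.hi;
	 *	}
	 */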

	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f

0:	.ifc		\pn, p64
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!
	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T3, T3
	vrev64.8	T1, T2

	vmull.p64	XH, HH4_H, XL_H		// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L		// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H	// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L	// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H	// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L		// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H		// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L	// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H	// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif

2:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)
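
	/*
	 * Big-picture reference (an illustrative C model, not the kernel's
	 * actual glue code; xor_block() and gf128_mul() are assumed helpers):
	 * both entry points implement the standard GHASH update
	 *
	 *	dg = (dg ^ block) * H	in GF(2^128)
	 *
	 * over the optional head block followed by 'blocks' full blocks:
	 *
	 *	void ghash_update_ref(int blocks, u8 dg[16], const u8 *src,
	 *			      const u8 h[16], const u8 *head)
	 *	{
	 *		if (head) {
	 *			xor_block(dg, head);	// dg ^= head
	 *			gf128_mul(dg, h);	// dg *= H
	 *		}
	 *		while (blocks--) {
	 *			xor_block(dg, src);	// dg ^= block
	 *			gf128_mul(dg, h);	// dg *= H
	 *			src += 16;
	 *		}
	 *	}
	 *
	 * pmull_ghash_update_p64 additionally consumes four blocks per loop
	 * iteration using the precomputed key powers loaded above (HH, HH3,
	 * HH4, plus their xor-folded halves), so the reduction step only has
	 * to run once per four blocks.
	 */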