/* rijndael-aarch64.S - ARMv8/AArch64 assembly implementation of AES cipher
*
* Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "asm-common-aarch64.h"
#if defined(__AARCH64EL__)
#ifdef HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS
.text
/* register macros */
#define CTX x0
#define RDST x1
#define RSRC x2
#define NROUNDS w3
#define RTAB x4
#define RMASK w5
#define RA w8
#define RB w9
#define RC w10
#define RD w11
#define RNA w12
#define RNB w13
#define RNC w14
#define RND w15
#define RT0 w6
#define RT1 w7
#define RT2 w16
#define xRT0 x6
#define xRT1 x7
#define xRT2 x16
#define xw8 x8
#define xw9 x9
#define xw10 x10
#define xw11 x11
#define xw12 x12
#define xw13 x13
#define xw14 x14
#define xw15 x15
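/* RA..RD hold the current four 32-bit state words, RNA..RND receive the
 * next round's state and RT0..RT2 are scratch.  The x-prefixed names are
 * the 64-bit views of the same registers; they are needed because the
 * register-offset addressing mode used for the table loads ([RTAB, Xm])
 * takes a 64-bit index register. */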
/***********************************************************************
* ARMv8/AArch64 assembly implementation of the AES cipher
***********************************************************************/
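/* preload_first_key(round, reg) fetches word 0 of round key 'round' ahead
 * of time so that the load overlaps the arithmetic of the current round;
 * dummy() is substituted where no preload is wanted. */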
#define preload_first_key(round, ra) \
ldr ra, [CTX, #(((round) * 16) + 0 * 4)];
#define dummy(round, ra) /* nothing */
#define addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
ldp rna, rnb, [CTX]; \
ldp rnc, rnd, [CTX, #8]; \
eor ra, ra, rna; \
eor rb, rb, rnb; \
eor rc, rc, rnc; \
preload_key(1, rna); \
eor rd, rd, rnd;
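/* One inner encryption round through a single 1 KiB table of 4-byte
 * entries.  RMASK is 0xff << 2, so each and+shift below yields a byte of
 * the state already scaled by 4, ready for use as a table offset.  For the
 * first output word the data flow amounts to
 *
 *   rna = rk[0] ^ T[byte0(ra)] ^ ror(T[byte1(rb)], 24)
 *               ^ ror(T[byte2(rc)], 16) ^ ror(T[byte3(rd)], 8)
 *
 * and analogously for rnb/rnc/rnd; the rotations stand in for the three
 * extra tables of the classic four-table variant.  preload_key is either
 * preload_first_key or dummy (see above). */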
#define do_encround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
\
and RT0, RMASK, ra, lsl#2; \
ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
and RT1, RMASK, ra, lsr#(8 - 2); \
ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
and RT2, RMASK, ra, lsr#(16 - 2); \
ldr RT0, [RTAB, xRT0]; \
and ra, RMASK, ra, lsr#(24 - 2); \
\
ldr RT1, [RTAB, xRT1]; \
eor rna, rna, RT0; \
ldr RT2, [RTAB, xRT2]; \
and RT0, RMASK, rd, lsl#2; \
ldr ra, [RTAB, x##ra]; \
\
eor rnd, rnd, RT1, ror #24; \
and RT1, RMASK, rd, lsr#(8 - 2); \
eor rnc, rnc, RT2, ror #16; \
and RT2, RMASK, rd, lsr#(16 - 2); \
eor rnb, rnb, ra, ror #8; \
ldr RT0, [RTAB, xRT0]; \
and rd, RMASK, rd, lsr#(24 - 2); \
\
ldr RT1, [RTAB, xRT1]; \
eor rnd, rnd, RT0; \
ldr RT2, [RTAB, xRT2]; \
and RT0, RMASK, rc, lsl#2; \
ldr rd, [RTAB, x##rd]; \
\
eor rnc, rnc, RT1, ror #24; \
and RT1, RMASK, rc, lsr#(8 - 2); \
eor rnb, rnb, RT2, ror #16; \
and RT2, RMASK, rc, lsr#(16 - 2); \
eor rna, rna, rd, ror #8; \
ldr RT0, [RTAB, xRT0]; \
and rc, RMASK, rc, lsr#(24 - 2); \
\
ldr RT1, [RTAB, xRT1]; \
eor rnc, rnc, RT0; \
ldr RT2, [RTAB, xRT2]; \
and RT0, RMASK, rb, lsl#2; \
ldr rc, [RTAB, x##rc]; \
\
eor rnb, rnb, RT1, ror #24; \
and RT1, RMASK, rb, lsr#(8 - 2); \
eor rna, rna, RT2, ror #16; \
and RT2, RMASK, rb, lsr#(16 - 2); \
eor rnd, rnd, rc, ror #8; \
ldr RT0, [RTAB, xRT0]; \
and rb, RMASK, rb, lsr#(24 - 2); \
\
ldr RT1, [RTAB, xRT1]; \
eor rnb, rnb, RT0; \
ldr RT2, [RTAB, xRT2]; \
eor rna, rna, RT1, ror #24; \
ldr rb, [RTAB, x##rb]; \
\
eor rnd, rnd, RT2, ror #16; \
preload_key((next_r) + 1, ra); \
eor rnc, rnc, rb, ror #8;
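/* Final encryption round: SubBytes and ShiftRows only, so single bytes are
 * looked up with ldrb and merged with orr/ror.  The indices are still
 * scaled by 4 (RMASK is unchanged); lastencround below bumps RTAB by one
 * byte beforehand so that each load picks the plain S-box byte out of its
 * 4-byte table entry.  The closing AddRoundKey is done by the
 * addroundkey() call in lastencround. */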
#define do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \
and RT0, RMASK, ra, lsl#2; \
and RT1, RMASK, ra, lsr#(8 - 2); \
and RT2, RMASK, ra, lsr#(16 - 2); \
ldrb rna, [RTAB, xRT0]; \
and ra, RMASK, ra, lsr#(24 - 2); \
ldrb rnd, [RTAB, xRT1]; \
and RT0, RMASK, rd, lsl#2; \
ldrb rnc, [RTAB, xRT2]; \
ror rnd, rnd, #24; \
ldrb rnb, [RTAB, x##ra]; \
and RT1, RMASK, rd, lsr#(8 - 2); \
ror rnc, rnc, #16; \
and RT2, RMASK, rd, lsr#(16 - 2); \
ror rnb, rnb, #8; \
ldrb RT0, [RTAB, xRT0]; \
and rd, RMASK, rd, lsr#(24 - 2); \
ldrb RT1, [RTAB, xRT1]; \
\
orr rnd, rnd, RT0; \
ldrb RT2, [RTAB, xRT2]; \
and RT0, RMASK, rc, lsl#2; \
ldrb rd, [RTAB, x##rd]; \
orr rnc, rnc, RT1, ror #24; \
and RT1, RMASK, rc, lsr#(8 - 2); \
orr rnb, rnb, RT2, ror #16; \
and RT2, RMASK, rc, lsr#(16 - 2); \
orr rna, rna, rd, ror #8; \
ldrb RT0, [RTAB, xRT0]; \
and rc, RMASK, rc, lsr#(24 - 2); \
ldrb RT1, [RTAB, xRT1]; \
\
orr rnc, rnc, RT0; \
ldrb RT2, [RTAB, xRT2]; \
and RT0, RMASK, rb, lsl#2; \
ldrb rc, [RTAB, x##rc]; \
orr rnb, rnb, RT1, ror #24; \
and RT1, RMASK, rb, lsr#(8 - 2); \
orr rna, rna, RT2, ror #16; \
ldrb RT0, [RTAB, xRT0]; \
and RT2, RMASK, rb, lsr#(16 - 2); \
ldrb RT1, [RTAB, xRT1]; \
orr rnd, rnd, rc, ror #8; \
ldrb RT2, [RTAB, xRT2]; \
and rb, RMASK, rb, lsr#(24 - 2); \
ldrb rb, [RTAB, x##rb]; \
\
orr rnb, rnb, RT0; \
orr rna, rna, RT1, ror #24; \
orr rnd, rnd, RT2, ror #16; \
orr rnc, rnc, rb, ror #8;
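/* Round wrappers for the encryption loop: firstencround() does the initial
 * AddRoundKey (key 0) plus the round with key 1, encround(r) does the round
 * with key r + 1, and lastencround(r) advances CTX to the final round key,
 * offsets RTAB by one byte for the S-box lookups, runs the last round and
 * XORs in the final round key. */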
#define firstencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key); \
do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key);
#define encround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key);
#define lastencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
add CTX, CTX, #(((round) + 1) * 16); \
add RTAB, RTAB, #1; \
do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \
addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy);
.globl _gcry_aes_arm_encrypt_block
ELF(.type _gcry_aes_arm_encrypt_block,%function;)
_gcry_aes_arm_encrypt_block:
/* input:
* %x0: keysched, CTX
* %x1: dst
* %x2: src
* %w3: number of rounds (10, 12 or 14)
* %x4: encryption table
*/
CFI_STARTPROC();
/* read input block */
/* aligned load */
ldp RA, RB, [RSRC];
ldp RC, RD, [RSRC, #8];
#ifndef __AARCH64EL__
rev RA, RA;
rev RB, RB;
rev RC, RC;
rev RD, RD;
#endif
mov RMASK, #(0xff << 2);
firstencround(0, RA, RB, RC, RD, RNA, RNB, RNC, RND);
encround(1, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
encround(2, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
encround(3, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
encround(4, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
encround(5, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
encround(6, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
encround(7, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
cmp NROUNDS, #12;
bge .Lenc_not_128;
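/* AES-128 (10 rounds): two remaining rounds, using round keys 9 and 10. */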
encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
lastencround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD);
.Lenc_done:
/* store output block */
/* aligned store */
#ifndef __AARCH64EL__
rev RA, RA;
rev RB, RB;
rev RC, RC;
rev RD, RD;
#endif
/* write output block */
stp RA, RB, [RDST];
stp RC, RD, [RDST, #8];
mov x0, #(0);
ret;
.ltorg
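/* NROUNDS >= 12: AES-256 falls through, AES-192 branches to .Lenc_192. */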
.Lenc_not_128:
beq .Lenc_192;
encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
encround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
encround(12, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
lastencround(13, RNA, RNB, RNC, RND, RA, RB, RC, RD);
b .Lenc_done;
.ltorg
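/* AES-192 (12 rounds): round keys 9..11, then the final round with key 12. */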
.Lenc_192:
encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
lastencround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD);
b .Lenc_done;
CFI_ENDPROC();
ELF(.size _gcry_aes_arm_encrypt_block,.-_gcry_aes_arm_encrypt_block;)
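/* Decryption mirrors the code above but walks the key schedule backwards:
 * addroundkey_dec() XORs in round key 'round' and preloads word 0 of round
 * key 'round' - 1 for the following do_decround(). */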
#define addroundkey_dec(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
ldr rna, [CTX, #(((round) * 16) + 0 * 4)]; \
ldr rnb, [CTX, #(((round) * 16) + 1 * 4)]; \
eor ra, ra, rna; \
ldr rnc, [CTX, #(((round) * 16) + 2 * 4)]; \
eor rb, rb, rnb; \
ldr rnd, [CTX, #(((round) * 16) + 3 * 4)]; \
eor rc, rc, rnc; \
preload_first_key((round) - 1, rna); \
eor rd, rd, rnd;
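/* One inner decryption round; same single-table scheme as do_encround, but
 * the byte sources follow InvShiftRows, e.g. for the first output word:
 *
 *   rna = rk[0] ^ Td[byte0(ra)] ^ ror(Td[byte1(rd)], 24)
 *               ^ ror(Td[byte2(rc)], 16) ^ ror(Td[byte3(rb)], 8)
 *
 * (columns b and d trade places relative to the encryption round). */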
#define do_decround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
\
and RT0, RMASK, ra, lsl#2; \
ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
and RT1, RMASK, ra, lsr#(8 - 2); \
ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
and RT2, RMASK, ra, lsr#(16 - 2); \
ldr RT0, [RTAB, xRT0]; \
and ra, RMASK, ra, lsr#(24 - 2); \
\
ldr RT1, [RTAB, xRT1]; \
eor rna, rna, RT0; \
ldr RT2, [RTAB, xRT2]; \
and RT0, RMASK, rb, lsl#2; \
ldr ra, [RTAB, x##ra]; \
\
eor rnb, rnb, RT1, ror #24; \
and RT1, RMASK, rb, lsr#(8 - 2); \
eor rnc, rnc, RT2, ror #16; \
and RT2, RMASK, rb, lsr#(16 - 2); \
eor rnd, rnd, ra, ror #8; \
ldr RT0, [RTAB, xRT0]; \
and rb, RMASK, rb, lsr#(24 - 2); \
\
ldr RT1, [RTAB, xRT1]; \
eor rnb, rnb, RT0; \
ldr RT2, [RTAB, xRT2]; \
and RT0, RMASK, rc, lsl#2; \
ldr rb, [RTAB, x##rb]; \
\
eor rnc, rnc, RT1, ror #24; \
and RT1, RMASK, rc, lsr#(8 - 2); \
eor rnd, rnd, RT2, ror #16; \
and RT2, RMASK, rc, lsr#(16 - 2); \
eor rna, rna, rb, ror #8; \
ldr RT0, [RTAB, xRT0]; \
and rc, RMASK, rc, lsr#(24 - 2); \
\
ldr RT1, [RTAB, xRT1]; \
eor rnc, rnc, RT0; \
ldr RT2, [RTAB, xRT2]; \
and RT0, RMASK, rd, lsl#2; \
ldr rc, [RTAB, x##rc]; \
\
eor rnd, rnd, RT1, ror #24; \
and RT1, RMASK, rd, lsr#(8 - 2); \
eor rna, rna, RT2, ror #16; \
and RT2, RMASK, rd, lsr#(16 - 2); \
eor rnb, rnb, rc, ror #8; \
ldr RT0, [RTAB, xRT0]; \
and rd, RMASK, rd, lsr#(24 - 2); \
\
ldr RT1, [RTAB, xRT1]; \
eor rnd, rnd, RT0; \
ldr RT2, [RTAB, xRT2]; \
eor rna, rna, RT1, ror #24; \
ldr rd, [RTAB, x##rd]; \
\
eor rnb, rnb, RT2, ror #16; \
preload_key((next_r) - 1, ra); \
eor rnc, rnc, rd, ror #8;
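/* Final decryption round.  By the time this runs, RMASK has been reset to
 * 0xff (set_last_round_rmask) and RTAB has been advanced 4 * 256 bytes past
 * the word table (lastdecround), so the indices are unscaled byte values
 * and the ldrb loads read the byte-wide inverse S-box directly. */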
#define do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \
and RT0, RMASK, ra; \
and RT1, RMASK, ra, lsr#8; \
and RT2, RMASK, ra, lsr#16; \
ldrb rna, [RTAB, xRT0]; \
lsr ra, ra, #24; \
ldrb rnb, [RTAB, xRT1]; \
and RT0, RMASK, rb; \
ldrb rnc, [RTAB, xRT2]; \
ror rnb, rnb, #24; \
ldrb rnd, [RTAB, x##ra]; \
and RT1, RMASK, rb, lsr#8; \
ror rnc, rnc, #16; \
and RT2, RMASK, rb, lsr#16; \
ror rnd, rnd, #8; \
ldrb RT0, [RTAB, xRT0]; \
lsr rb, rb, #24; \
ldrb RT1, [RTAB, xRT1]; \
\
orr rnb, rnb, RT0; \
ldrb RT2, [RTAB, xRT2]; \
and RT0, RMASK, rc; \
ldrb rb, [RTAB, x##rb]; \
orr rnc, rnc, RT1, ror #24; \
and RT1, RMASK, rc, lsr#8; \
orr rnd, rnd, RT2, ror #16; \
and RT2, RMASK, rc, lsr#16; \
orr rna, rna, rb, ror #8; \
ldrb RT0, [RTAB, xRT0]; \
lsr rc, rc, #24; \
ldrb RT1, [RTAB, xRT1]; \
\
orr rnc, rnc, RT0; \
ldrb RT2, [RTAB, xRT2]; \
and RT0, RMASK, rd; \
ldrb rc, [RTAB, x##rc]; \
orr rnd, rnd, RT1, ror #24; \
and RT1, RMASK, rd, lsr#8; \
orr rna, rna, RT2, ror #16; \
ldrb RT0, [RTAB, xRT0]; \
and RT2, RMASK, rd, lsr#16; \
ldrb RT1, [RTAB, xRT1]; \
orr rnb, rnb, rc, ror #8; \
ldrb RT2, [RTAB, xRT2]; \
lsr rd, rd, #24; \
ldrb rd, [RTAB, x##rd]; \
\
orr rnd, rnd, RT0; \
orr rna, rna, RT1, ror #24; \
orr rnb, rnb, RT2, ror #16; \
orr rnc, rnc, rd, ror #8;
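/* Round wrappers for the decryption loop: firstdecround(r) adds round key
 * r + 1 and runs the round with key r, decround(r) runs the round with key
 * r, set_last_round_rmask() drops the index scaling for the byte-indexed
 * last round, and lastdecround() points RTAB at the inverse S-box, runs the
 * last round and XORs in round key 0 (CTX still points at the start of the
 * key schedule). */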
#define firstdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
addroundkey_dec(((round) + 1), ra, rb, rc, rd, rna, rnb, rnc, rnd); \
do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key);
#define decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key);
#define set_last_round_rmask(_, __) \
mov RMASK, #0xff;
#define lastdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
add RTAB, RTAB, #(4 * 256); \
do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \
addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy);
.globl _gcry_aes_arm_decrypt_block
ELF(.type _gcry_aes_arm_decrypt_block,%function;)
_gcry_aes_arm_decrypt_block:
/* input:
* %x0: keysched, CTX
* %x1: dst
* %x2: src
* %w3: number of rounds (10, 12 or 14)
* %x4: decryption table
*/
CFI_STARTPROC();
/* read input block */
/* aligned load */
ldp RA, RB, [RSRC];
ldp RC, RD, [RSRC, #8];
#ifndef __AARCH64EL__
rev RA, RA;
rev RB, RB;
rev RC, RC;
rev RD, RD;
#endif
mov RMASK, #(0xff << 2);
cmp NROUNDS, #12;
bge .Ldec_256;
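/* AES-128 (10 rounds): start from the last round key (key 10). */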
firstdecround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND);
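/* Rounds with keys 8..1 plus the final round (key 0) are shared by all key
 * sizes. */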
.Ldec_tail:
decround(8, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
decround(7, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
decround(6, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
decround(5, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
decround(4, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
decround(3, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
decround(2, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
decround(1, RA, RB, RC, RD, RNA, RNB, RNC, RND, set_last_round_rmask);
lastdecround(0, RNA, RNB, RNC, RND, RA, RB, RC, RD);
/* store output block */
/* aligned store */
#ifndef __AARCH64EL__
rev RA, RA;
rev RB, RB;
rev RC, RC;
rev RD, RD;
#endif
/* write output block */
stp RA, RB, [RDST];
stp RC, RD, [RDST, #8];
mov x0, #(0);
ret;
.ltorg
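/* NROUNDS >= 12: AES-256 falls through, AES-192 branches to .Ldec_192. */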
.Ldec_256:
beq .Ldec_192;
firstdecround(13, RA, RB, RC, RD, RNA, RNB, RNC, RND);
decround(12, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
decround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
b .Ldec_tail;
.ltorg
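/* AES-192 (12 rounds): round keys 12..9, then the shared tail. */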
.Ldec_192:
firstdecround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND);
decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
b .Ldec_tail;
CFI_ENDPROC();
ELF(.size _gcry_aes_arm_decrypt_block,.-_gcry_aes_arm_decrypt_block;)
#endif /*HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS*/
#endif /*__AARCH64EL__ */