Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ clean:
rm -rf ${pwd}/meson
$(MAKE) clean -C ${pwd}/lib/lua54/src
$(MAKE) clean -C ${pwd}/lib/pqclean
$(MAKE) clean -C ${pwd}/lib/mayo
$(MAKE) clean -C ${pwd}/lib/mlkem
$(MAKE) clean -C ${pwd}/lib/longfellow-zk
$(MAKE) clean -C ${pwd}/lib/zk-circuit-lang
Expand Down
5 changes: 5 additions & 0 deletions REUSE.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,8 @@ precedence = "aggregate"
SPDX-FileCopyrightText = "Copyright (c) 2025 Google LLC."
SPDX-License-Identifier = "Apache-2.0"

[[annotations]]
path = ["lib/mayo/**"]
precedence = "override"
SPDX-FileCopyrightText = "Copyright (c) MAYO-C authors"
SPDX-License-Identifier = "Apache-2.0 AND MIT"
10 changes: 10 additions & 0 deletions build/deps.mk
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,13 @@ mlkem:
test/build/libmlkem.a \
OPT=0 Q="" AUTO=0 MLKEM_K=4 \
MLK_MULTILEVEL_BUILD_WITH_SHARED=1

# Build the MAYO library by running the Makefile in lib/mayo.
# The toolchain (CC, LD, AR, RANLIB) and flags are handed over through the
# environment of the sub-make; mayo_cflags is placed ahead of the global
# cflags in the CFLAGS value that lib/mayo/Makefile receives.
mayo:
$(info -- Building MAYO libs)
CC="${mayo_cc}" \
LD=${ld} \
AR=${ar} \
CFLAGS="${mayo_cflags} ${cflags}" \
RANLIB=${ranlib} \
LDFLAGS="${ldflags}" \
${MAKE} -C ${pwd}/lib/mayo
14 changes: 10 additions & 4 deletions build/init.mk
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,15 @@ ZEN_SOURCES := src/zenroom.o src/zen_error.o src/lua_functions.o \
src/zen_ed.o src/zen_float.o src/zen_time.o src/api_hash.o \
src/api_sign.o src/randombytes.o src/zen_fuzzer.o src/cortex_m.o \
src/p256-m.o src/zen_p256.o src/zen_rsa.o src/zen_bbs.o \
src/zen_longfellow.o
src/zen_longfellow.o src/zen_mayo.o

ZEN_INCLUDES += -Isrc -Ilib/lua54/src -Ilib -I/usr/local/include \
-Ilib/milagro-crypto-c/build/include -Ilib/milagro-crypto-c/include \
-Ilib/ed25519-donna -Ilib/longfellow-zk -Wall -Wextra
-Ilib/ed25519-donna -Ilib/longfellow-zk -Ilib/mayo -Wall -Wextra

BUILD_DEPS ?= apply-patches milagro lua54 embed-lua mlkem \
quantum-proof ed25519-donna longfellow-zk \
zk-circuit-lang zstd
zk-circuit-lang zstd mayo

pwd := $(shell pwd)
mil := ${pwd}/build/milagro
Expand All @@ -48,6 +48,7 @@ ldadd += ${pwd}/lib/mlkem/test/build/libmlkem.a
ldadd += ${pwd}/lib/longfellow-zk/liblongfellow-zk.a
ldadd += ${pwd}/lib/zstd/libzstd.a
ldadd += ${pwd}/lib/zk-circuit-lang/libzk-circuit-lang.a
ldadd += $(pwd)/lib/mayo/libmayo.a
ldadd += -lstdc++

# ----------------
Expand Down Expand Up @@ -148,4 +149,9 @@ endif
# zstd settings
zstd_cc ?= ${cc}

# }}}
#-----------------
# mayo settings
mayo_cc ?= ${cc}
mayo_cflags ?= -I ${pwd}/src -I. -fPIC -DMAYO_BUILD_TYPE_REF=1 -DMAYO_VARIANT=MAYO_5

# }}}
2 changes: 1 addition & 1 deletion build/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ endif

## BATS tests in vectors
if suite.contains('vectors')
tests = [ 'aes', 'ecdsa_p256', 'hmac', 'eddsa', 'sha', 'rsa', 'qp', 'merkletree' ]
tests = [ 'aes', 'ecdsa_p256', 'hmac', 'eddsa', 'sha', 'rsa', 'qp', 'merkletree', 'mayo' ]
foreach test_suite : tests
test('vectors_'+test_suite.underscorify(),
bats_bin,
Expand Down
13 changes: 13 additions & 0 deletions lib/mayo/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Builds libmayo.a, a static archive of the MAYO objects listed in MAYO.
#
# CFLAGS and CC are only defaults (?=): values supplied by the environment
# or on the make command line take precedence.
CFLAGS ?= -O2 -I../../src -I. -fPIC -fstack-protector-all -D_FORTIFY_SOURCE=2 -fno-strict-overflow -DMAYO_BUILD_TYPE_REF=1 -DMAYO_VARIANT=MAYO_5
CC ?= gcc

# Objects archived into libmayo.a. No explicit compile rule is given here,
# so they are produced by make's built-in %.o: %.c rule using CC and CFLAGS.
MAYO=aes_c.o aes128ctr.o mem.o api.o mayo.o params.o arithmetic.o

# all and clean do not name files; declare them phony so a stray file called
# "all" or "clean" in this directory cannot prevent the targets from running.
.PHONY: all clean

all: libmayo.a

libmayo.a: ${MAYO}
	${AR} -r libmayo.a ${MAYO}

clean:
	rm -f *.a
	rm -f ${MAYO}
293 changes: 293 additions & 0 deletions lib/mayo/aes128ctr.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,293 @@
// SPDX-License-Identifier: Apache-2.0 and MIT and Public Domain

#ifdef ENABLE_AESNI

#include <mem.h>
#include <stdint.h>
#include <string.h>
#include <tmmintrin.h>
#include <wmmintrin.h>

// Adapted from liboqs/src/common/aes, which in turn takes it from:
// crypto_core/aes128encrypt/dolbeau/aesenc-int
// (https://bench.cr.yp.to/supercop.html)
/*
 * Expand a 16-byte AES-128 key into its 11 round keys using AES-NI.
 *
 * key:   16 bytes, read with an unaligned load.
 * rkeys: receives the round keys; rkeys[0] is the key itself and every
 *        following entry is derived from the one before it.
 */
static inline void aes128ni_setkey_encrypt(const unsigned char *key,
                                           __m128i rkeys[11]) {
  __m128i cur = _mm_loadu_si128((const __m128i *)key);
  __m128i assist, shifted;
  int slot = 0;

/*
 * Store the current round key, then derive the next one into `cur`.
 * RCON has to be a literal because the key-generation intrinsic takes it
 * as an immediate operand, which is why this is a macro and not a loop.
 */
#define AES128NI_NEXT_ROUND_KEY(RCON)                                          \
  do {                                                                         \
    assist = _mm_aeskeygenassist_si128(cur, RCON);                             \
    rkeys[slot++] = cur;                                                       \
    shifted = _mm_slli_si128(cur, 4);                                          \
    cur = _mm_xor_si128(cur, shifted);                                         \
    shifted = _mm_slli_si128(cur, 8);                                          \
    cur = _mm_xor_si128(cur, shifted);                                         \
    assist = _mm_shuffle_epi32(assist, 0xff);                                  \
    cur = _mm_xor_si128(cur, assist);                                          \
  } while (0)

  AES128NI_NEXT_ROUND_KEY(0x01);
  AES128NI_NEXT_ROUND_KEY(0x02);
  AES128NI_NEXT_ROUND_KEY(0x04);
  AES128NI_NEXT_ROUND_KEY(0x08);
  AES128NI_NEXT_ROUND_KEY(0x10);
  AES128NI_NEXT_ROUND_KEY(0x20);
  AES128NI_NEXT_ROUND_KEY(0x40);
  AES128NI_NEXT_ROUND_KEY(0x80);
  AES128NI_NEXT_ROUND_KEY(0x1b);
  AES128NI_NEXT_ROUND_KEY(0x36);

#undef AES128NI_NEXT_ROUND_KEY

  /* The tenth derivation is the last round key. */
  rkeys[slot] = cur;
}

/*
 * Allocate an 11-entry round-key schedule and fill it from `key`.
 *
 * key:       16-byte AES-128 key.
 * _schedule: receives the newly allocated schedule, or NULL when the
 *            allocation fails. Callers must check for NULL before using
 *            it and release it with oqs_aes128_free_schedule_ni().
 */
void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule) {
  __m128i *schedule = malloc(11 * sizeof *schedule);

  *_schedule = schedule;
  if (schedule == NULL) {
    /* Out of memory: do not expand the key into a NULL pointer. */
    return;
  }
  aes128ni_setkey_encrypt(key, schedule);
}

/*
 * Release a schedule obtained from oqs_aes128_load_schedule_ni().
 * A NULL schedule is ignored. The release is delegated to
 * mayo_secure_free() together with the schedule size (11 round keys).
 */
void oqs_aes128_free_schedule_ni(void *schedule) {
  if (schedule == NULL) {
    return;
  }
  mayo_secure_free(schedule, 11 * sizeof(__m128i));
}

// Single encryption
/*
 * Encrypt one 16-byte block with the full 10-round AES-128.
 *
 * rkeys: the 11 round keys.
 * nv:    the input block.
 * out:   receives 16 bytes; written with an unaligned store.
 */
static inline void aes128ni_encrypt(const __m128i rkeys[11], __m128i nv,
                                    unsigned char *out) {
  /* Initial whitening with round key 0. */
  __m128i state = _mm_xor_si128(nv, rkeys[0]);

  /* Nine regular rounds, then the final round with round key 10. */
  for (int round = 1; round <= 9; round++) {
    state = _mm_aesenc_si128(state, rkeys[round]);
  }
  state = _mm_aesenclast_si128(state, rkeys[10]);

  _mm_storeu_si128((__m128i *)out, state);
}

// 4x interleaved encryption
/*
 * Encrypt four independent 16-byte blocks with the full 10-round AES-128,
 * applying each round to all four blocks before moving to the next round.
 *
 * rkeys:   the 11 round keys.
 * n0..n3:  the four input blocks.
 * out:     receives 64 bytes: the results for n0, n1, n2, n3 in that order,
 *          written with unaligned stores.
 */
static inline void aes128ni_encrypt_x4(const __m128i rkeys[11], __m128i n0,
                                       __m128i n1, __m128i n2, __m128i n3,
                                       unsigned char *out) {
  const __m128i first = rkeys[0];
  const __m128i last = rkeys[10];
  __m128i s0 = _mm_xor_si128(n0, first);
  __m128i s1 = _mm_xor_si128(n1, first);
  __m128i s2 = _mm_xor_si128(n2, first);
  __m128i s3 = _mm_xor_si128(n3, first);

  for (int round = 1; round <= 9; round++) {
    const __m128i rk = rkeys[round];
    s0 = _mm_aesenc_si128(s0, rk);
    s1 = _mm_aesenc_si128(s1, rk);
    s2 = _mm_aesenc_si128(s2, rk);
    s3 = _mm_aesenc_si128(s3, rk);
  }

  s0 = _mm_aesenclast_si128(s0, last);
  s1 = _mm_aesenclast_si128(s1, last);
  s2 = _mm_aesenclast_si128(s2, last);
  s3 = _mm_aesenclast_si128(s3, last);

  _mm_storeu_si128((__m128i *)(out + 0), s0);
  _mm_storeu_si128((__m128i *)(out + 16), s1);
  _mm_storeu_si128((__m128i *)(out + 32), s2);
  _mm_storeu_si128((__m128i *)(out + 48), s3);
}

// Not for general use: IV = 0, nonce = 0
/*
 * Write `out_len` bytes of AES-128 counter-mode keystream to `out`.
 *
 * Not for general use: the starting counter block is all zero (IV = 0,
 * nonce = 0). Only bytes 8..15 of the block are incremented, as a
 * big-endian 64-bit counter; bytes 0..7 stay zero.
 *
 * schedule: the 11 round keys (passed on as const __m128i *).
 * out:      destination buffer of at least out_len bytes.
 * out_len:  number of bytes to produce; need not be a multiple of 16.
 */
static void oqs_aes128_ctr_enc_sch_ni(const void *schedule, uint8_t *out,
                                      size_t out_len) {
  const __m128i *rkeys = schedule;
  /* Shuffle control that reverses bytes 8..15 and keeps bytes 0..7 in
   * place. Applying it twice is the identity, so the counter can be kept
   * in arithmetic order and converted only when a block is emitted. */
  const __m128i swap_hi =
      _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
  const __m128i one = _mm_set_epi64x(1, 0);
  __m128i ctr = _mm_set_epi64x(0, 0);

  /* Bulk: four counter blocks (64 bytes) per iteration. */
  for (; out_len >= 64; out += 64, out_len -= 64) {
    const __m128i c1 = _mm_add_epi64(ctr, one);
    const __m128i c2 = _mm_add_epi64(c1, one);
    const __m128i c3 = _mm_add_epi64(c2, one);
    aes128ni_encrypt_x4(rkeys, _mm_shuffle_epi8(ctr, swap_hi),
                        _mm_shuffle_epi8(c1, swap_hi),
                        _mm_shuffle_epi8(c2, swap_hi),
                        _mm_shuffle_epi8(c3, swap_hi), out);
    ctr = _mm_add_epi64(c3, one);
  }

  /* Remaining whole blocks, one at a time. */
  for (; out_len >= 16; out += 16, out_len -= 16) {
    aes128ni_encrypt(rkeys, _mm_shuffle_epi8(ctr, swap_hi), out);
    ctr = _mm_add_epi64(ctr, one);
  }

  /* Trailing partial block: encrypt into scratch space, copy the prefix. */
  if (out_len > 0) {
    uint8_t last[16];
    aes128ni_encrypt(rkeys, _mm_shuffle_epi8(ctr, swap_hi), last);
    memcpy(out, last, out_len);
  }
}

/*
 * Fill `output` with `outputByteLen` bytes of AES-128 counter-mode
 * keystream keyed by the first 16 bytes of `input` (zero IV and nonce).
 *
 * output:        destination buffer.
 * outputByteLen: number of keystream bytes to write.
 * input:         AES-128 key; 16 bytes are read from it.
 * inputByteLen:  not used by this implementation.
 *
 * Returns outputByteLen converted to int on success, or -1 if the key
 * schedule could not be allocated, in which case `output` is untouched.
 */
int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
                   const unsigned char *input, size_t inputByteLen) {
  void *schedule = NULL;

  (void)inputByteLen; /* the key length is fixed at 16 bytes */

  oqs_aes128_load_schedule_ni(input, &schedule);
  if (schedule == NULL) {
    /* No schedule available: report failure rather than dereference NULL. */
    return -1;
  }
  oqs_aes128_ctr_enc_sch_ni(schedule, output, outputByteLen);
  oqs_aes128_free_schedule_ni(schedule);
  return (int)outputByteLen;
}

// 4-Round AES...

// From crypto_core/aes128encrypt/dolbeau/aesenc-int
/*
 * Expand a 16-byte key into the 5 round keys used by the 4-round AES
 * variant, using AES-NI.
 *
 * key:   16 bytes, read with an unaligned load.
 * rkeys: receives the round keys; rkeys[0] is the key itself and every
 *        following entry is derived from the one before it.
 */
static inline void aes128r4ni_setkey_encrypt(const unsigned char *key,
                                             __m128i rkeys[5]) {
  __m128i cur = _mm_loadu_si128((const __m128i *)key);
  __m128i assist, shifted;
  int slot = 0;

/*
 * Blockshift-based step by Cedric Bourrasset: store the current round key,
 * then derive the next one into `cur`. RCON has to be a literal because the
 * key-generation intrinsic takes it as an immediate operand.
 */
#define AES128R4NI_NEXT_ROUND_KEY(RCON)                                        \
  do {                                                                         \
    assist = _mm_aeskeygenassist_si128(cur, RCON);                             \
    rkeys[slot++] = cur;                                                       \
    shifted = _mm_slli_si128(cur, 4);                                          \
    cur = _mm_xor_si128(cur, shifted);                                         \
    shifted = _mm_slli_si128(cur, 8);                                          \
    cur = _mm_xor_si128(cur, shifted);                                         \
    assist = _mm_shuffle_epi32(assist, 0xff);                                  \
    cur = _mm_xor_si128(cur, assist);                                          \
  } while (0)

  AES128R4NI_NEXT_ROUND_KEY(0x01);
  AES128R4NI_NEXT_ROUND_KEY(0x02);
  AES128R4NI_NEXT_ROUND_KEY(0x04);
  AES128R4NI_NEXT_ROUND_KEY(0x08);

#undef AES128R4NI_NEXT_ROUND_KEY

  /* The fourth derivation is the last round key. */
  rkeys[slot] = cur;
}

/*
 * Allocate a 5-entry round-key schedule for the 4-round AES variant and
 * fill it from `key`.
 *
 * key:       16-byte key.
 * _schedule: receives the newly allocated schedule, or NULL when the
 *            allocation fails. Callers must check for NULL before using
 *            it and release it with oqs_aes128r4_free_schedule_ni().
 */
void oqs_aes128r4_load_schedule_ni(const uint8_t *key, void **_schedule) {
  __m128i *schedule = malloc(5 * sizeof *schedule);

  *_schedule = schedule;
  if (schedule == NULL) {
    /* Out of memory: do not expand the key into a NULL pointer. */
    return;
  }
  aes128r4ni_setkey_encrypt(key, schedule);
}

/*
 * Release a schedule obtained from oqs_aes128r4_load_schedule_ni().
 * A NULL schedule is ignored. The release is delegated to
 * mayo_secure_free() together with the schedule size (5 round keys).
 */
void oqs_aes128r4_free_schedule_ni(void *schedule) {
  if (schedule == NULL) {
    return;
  }
  mayo_secure_free(schedule, 5 * sizeof(__m128i));
}

// Single encryption
/*
 * Encrypt one 16-byte block with the reduced 4-round AES variant.
 *
 * rkeys: the 5 round keys.
 * nv:    the input block.
 * out:   receives 16 bytes; written with an unaligned store.
 */
static inline void aes128r4ni_encrypt(const __m128i rkeys[5], __m128i nv,
                                      unsigned char *out) {
  /* Initial whitening with round key 0. */
  __m128i state = _mm_xor_si128(nv, rkeys[0]);

  /* Three regular rounds, then the final round with round key 4. */
  for (int round = 1; round <= 3; round++) {
    state = _mm_aesenc_si128(state, rkeys[round]);
  }
  state = _mm_aesenclast_si128(state, rkeys[4]);

  _mm_storeu_si128((__m128i *)out, state);
}

// 4x interleaved encryption
/*
 * Encrypt four independent 16-byte blocks with the reduced 4-round AES
 * variant, applying each round to all four blocks before the next round.
 *
 * rkeys:   the 5 round keys.
 * n0..n3:  the four input blocks.
 * out:     receives 64 bytes: the results for n0, n1, n2, n3 in that order,
 *          written with unaligned stores.
 */
static inline void aes128r4ni_encrypt_x4(const __m128i rkeys[5], __m128i n0,
                                         __m128i n1, __m128i n2, __m128i n3,
                                         unsigned char *out) {
  const __m128i first = rkeys[0];
  const __m128i last = rkeys[4];
  __m128i s0 = _mm_xor_si128(n0, first);
  __m128i s1 = _mm_xor_si128(n1, first);
  __m128i s2 = _mm_xor_si128(n2, first);
  __m128i s3 = _mm_xor_si128(n3, first);

  for (int round = 1; round <= 3; round++) {
    const __m128i rk = rkeys[round];
    s0 = _mm_aesenc_si128(s0, rk);
    s1 = _mm_aesenc_si128(s1, rk);
    s2 = _mm_aesenc_si128(s2, rk);
    s3 = _mm_aesenc_si128(s3, rk);
  }

  s0 = _mm_aesenclast_si128(s0, last);
  s1 = _mm_aesenclast_si128(s1, last);
  s2 = _mm_aesenclast_si128(s2, last);
  s3 = _mm_aesenclast_si128(s3, last);

  _mm_storeu_si128((__m128i *)(out + 0), s0);
  _mm_storeu_si128((__m128i *)(out + 16), s1);
  _mm_storeu_si128((__m128i *)(out + 32), s2);
  _mm_storeu_si128((__m128i *)(out + 48), s3);
}

// Not for general use: IV = 0, nonce = 0
/*
 * Write `out_len` bytes of counter-mode keystream to `out` using the
 * reduced 4-round AES variant.
 *
 * Not for general use: the starting counter block is all zero (IV = 0,
 * nonce = 0). Only bytes 8..15 of the block are incremented, as a
 * big-endian 64-bit counter; bytes 0..7 stay zero.
 *
 * schedule: the 5 round keys (passed on as const __m128i *).
 * out:      destination buffer of at least out_len bytes.
 * out_len:  number of bytes to produce; need not be a multiple of 16.
 */
static void oqs_aes128r4_ctr_enc_sch_ni(const void *schedule, uint8_t *out,
                                        size_t out_len) {
  const __m128i *rkeys = schedule;
  /* Shuffle control that reverses bytes 8..15 and keeps bytes 0..7 in
   * place. Applying it twice is the identity, so the counter can be kept
   * in arithmetic order and converted only when a block is emitted. */
  const __m128i swap_hi =
      _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
  const __m128i one = _mm_set_epi64x(1, 0);
  __m128i ctr = _mm_set_epi64x(0, 0);

  /* Bulk: four counter blocks (64 bytes) per iteration. */
  for (; out_len >= 64; out += 64, out_len -= 64) {
    const __m128i c1 = _mm_add_epi64(ctr, one);
    const __m128i c2 = _mm_add_epi64(c1, one);
    const __m128i c3 = _mm_add_epi64(c2, one);
    aes128r4ni_encrypt_x4(rkeys, _mm_shuffle_epi8(ctr, swap_hi),
                          _mm_shuffle_epi8(c1, swap_hi),
                          _mm_shuffle_epi8(c2, swap_hi),
                          _mm_shuffle_epi8(c3, swap_hi), out);
    ctr = _mm_add_epi64(c3, one);
  }

  /* Remaining whole blocks, one at a time. */
  for (; out_len >= 16; out += 16, out_len -= 16) {
    aes128r4ni_encrypt(rkeys, _mm_shuffle_epi8(ctr, swap_hi), out);
    ctr = _mm_add_epi64(ctr, one);
  }

  /* Trailing partial block: encrypt into scratch space, copy the prefix. */
  if (out_len > 0) {
    uint8_t last[16];
    aes128r4ni_encrypt(rkeys, _mm_shuffle_epi8(ctr, swap_hi), last);
    memcpy(out, last, out_len);
  }
}

/*
 * Fill `output` with `outputByteLen` bytes of counter-mode keystream from
 * the reduced 4-round AES variant, keyed by the first 16 bytes of `input`
 * (zero IV and nonce).
 *
 * output:        destination buffer.
 * outputByteLen: number of keystream bytes to write.
 * input:         key; 16 bytes are read from it.
 * inputByteLen:  not used by this implementation.
 *
 * Returns outputByteLen converted to int on success, or -1 if the key
 * schedule could not be allocated, in which case `output` is untouched.
 */
int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
                      const unsigned char *input, size_t inputByteLen) {
  void *schedule = NULL;

  (void)inputByteLen; /* the key length is fixed at 16 bytes */

  oqs_aes128r4_load_schedule_ni(input, &schedule);
  if (schedule == NULL) {
    /* No schedule available: report failure rather than dereference NULL. */
    return -1;
  }
  oqs_aes128r4_ctr_enc_sch_ni(schedule, output, outputByteLen);
  oqs_aes128r4_free_schedule_ni(schedule);
  return (int)outputByteLen;
}
#endif

Loading